Note: project github page: https://github.com/mlagisz/survey_best_paper_awards

Load Scimago Subject Area -level data set and associated meta-data table.

# accessing all the sheets 
#sheet_names <- excel_sheets(here("data", "scimagojr 2021  Subject Areas.xlsx"))
sheet_names <- excel_sheets(here("data", "scimagojr 2021  Subject Areas_screening.xlsx"))
sheet_names <- sheet_names[-1]#remove first sheet with meta-data
SA_list_all <- lapply(sheet_names, function(x) {as.data.frame(read_excel(here("data", "scimagojr 2021  Subject Areas.xlsx"), sheet = x))}) #read all sheets to list
names(SA_list_all) <- sheet_names  #rename list elements 
#lapply(SA_list_all, names) #extract column names
#View(do.call(rbind, lapply(SA_list_all, names))) #View all column names as a table
#unique(do.call(rbind, lapply(SA_list_all, names))) #making sure all have same column names
SAdata <- do.call(rbind, SA_list_all)
#str(SAdata) #one large data frame
#names(SAdata)
names(SAdata) <- gsub(" ","_", names(SAdata)) #change spaces to _ in the column names
names(SAdata) <- gsub("\\.","", names(SAdata)) #remove . the column names
#dim(SAdata)
SAdata <- SAdata %>% drop_na(Subject_area)
#table(is.na(SAdata$Title)) #no empty journal names

SAmeta <- read_excel(here("data", "scimagojr 2021  Subject Areas_screening.xlsx"), sheet = 1, skip = 1) #load Scimago SA meta-data
#dim(SAmeta)
#names(SAmeta)

Load award-level data set and associated meta-data table.

#BP load and prepare award-level meta-data
BPmeta <- read_excel(here("data", "Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx"), sheet = 2, skip = 1) #load main award #BPmeta <- read_excel("./data/Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx", sheet = 2, skip = 1) #load main award meta-data
#dim(BPmeta)
#names(BPmeta)

## load and prepare main extracted award-level data
BPdata <- read_excel(here("data","Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx"), sheet = 1) #load main award data
#BPdata <- read_excel("./data/Survey-Best_paper_awards (Responses)_SHAREDCOPY_checked.xlsx", sheet = 1) #load main award data
#dim(BPdata)
#names(BPdata)

#BPdata <- select(BPdata, !starts_with("...") ) #remove last few empty columns
#BPdata_orig_colnames <- names(BPdata) #store original column names
#BPdata$Award_name <- gsub("^The ", "", BPdata$Award_name) #remove "The" at the beginning of award names
names(BPdata) <- gsub(" ","_", names(BPdata)) #change spaces to _ in the column names

#rename selected data columns
BPdata <- BPdata %>% 
rename(Extractor_name = "Name_of_the_extracting_person",
       Award_name = "Full_name_of_the_award", 
       Journal_name = "Full_name_of_the_awarding_journal", 
       Award_description = "Paste_the_information_text_describing_the_award", 
       Eligible = "Confirm_eligibility_of_the_award",
       Awarding_journal = "Full_name_of_the_awarding_journal",
       Awarding_society = "Full_name_of_the_awarding_society",
       Awarding_other = "Full_name_of_the_awarding_publisher/other_body",  
       Career_stage = "Target_career_stage_of_eligible_applicants",
       Flexible_eligibility = "Flexibility_of_the_eligibility_criteria", 
       Assessors_transparency = "Assessor_transparency", 
       Open_science = "Valuing_Open_Science",  
       Self_nomination = "Self-nomination_allowed"
       )
#names(BPdata)

##check for rows with empty "Scimago Subject Area" values
table(is.na(BPdata$Scimago_Subject_Area)) #4 rows from pilot screening
## 
## FALSE  TRUE 
##   264     4
##remove rows with pilot extractions and empty "Scimago Subject Area" values
BPdata <- BPdata[!is.na(BPdata$Scimago_Subject_Area), ]

#remove awards that were duplicate-extracted and excluded at extraction stages
BPdata <- BPdata[BPdata$Row_excluded == 0, ]
dim(BPdata) #223 rows left
## [1] 222  54
##NOTE: total of 41 rows were removed as duplicates: 18 awards were doubly- or triply- extracted (total 18 out of 223 = 8%)

#create new variable for awards with description or without
BPdata$Descr_available <- fct_collapse(BPdata$Open_science,
  available = c("no", "yes"),
  "not available" = "not available"
)

Load winners-level data set and associated meta-data table.

#load individual winnersdata
BPindiv <- read_csv(here("data", "BP_awards_lists_SHAREDCOPY - indiv_winners_20230717.csv"), skip = 1) #load individual winners data
## Rows: 1083 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (20): award_SA, award_name, award_link, individual, awardee_name, awarde...
## dbl  (2): record_count, award_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#dim(BPindiv)
#names(BPindiv)

#load individual winners meta-data
BPindiv_meta <- read_csv(here("data", "BP_awards_lists_SHAREDCOPY - indiv_winners_meta-data.csv"), skip = 1) #load individual winners data
## Rows: 22 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): column name, description, data type [options]
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#dim(BPindiv_meta)
#names(BPindiv_meta)

Load country-level productivity SCImago data set and create associated meta-data table.

#load SCImago 2021 country productivity (documents) data downloaded from https://www.scimagojr.com/countryrank.php?year=2021&min=0&min_type=it
COprod <- read_csv(here("data", "scimagojr country rank 2021.csv"), skip = 0) #load data
## Rows: 235 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Region
## dbl (7): Rank, Documents, Citable documents, Citations, Self-citations, Cita...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#dim(COprod)
#names(COprod)

#create meta-data with columns:         data type [options]
COprod_meta <- tibble("column name" = names(COprod), 
                      "description" = c("Rank of a given country across all Scimago Subject Areas.",
                                        "Country name.",
                                        "Country location classification into one of the eight SCImago Major World Regions.",
                                        "Number of documents associated with a goven country published in 2021.",
                                        "Number of citable documents associated with a goven country published in 2021.",
                                        "Number of citations to documents associated with a goven country published in 2021.",
                                        "Number of self-citations to documents associated with a goven country published in 2021.",
                                        "Number of citations per document associated with a goven country published in 2021.",
                                        "H index of a country in 2021." ),
                      "data type [options]" = c("integer",
                                                "free text",
                                                "free text",
                                                "integer",
                                                "integer",
                                                "integer",
                                                "integer",
                                                "numeric",
                                                "integer"
                                                ))
#dim(COprod_meta)
#names(COprod_meta)

1 Supplementary Methods

1.1 Project workflow.

project workflow{#id .class width = 50% height = 50%} Figure S1.
Figure S1. project workflow..

1.1.1 Award inclusion criteria

These criteria wer used to increase data consistency and improve comparisons across disciplines: We will only include awards for a single published research contribution in a format of a research article (theses / dissertations are allowed if published as a journal article). Awards must be managed by a journal, publisher, or a learned society. We will exclude awards that are discontinued, and awards that are restricted to applicants from underrepresented groups, e.g., women-only / minorities-only awards.

1.1.2 Screening

If some of these awards get excluded during data extraction, we will return to the screening stage for that subject area to find replacement awards until we have 10 awards that are eligible for inclusion, or we run out of journals to screen.

· During screening, we will see if journals note “best paper” awards on their websites, and if not, we will check if journals are run by learned societies. If so, we will check if a learned society awards a “best paper” prize that fits in a given subject area being assessed. If a society awards multiple relevant awards in journals in the same subject area, we will choose one for a highest-ranked journal, or that appears to be most prestigious (e.g., with highest monetary value, oldest). · If less than 10 awards are found for a subject area, we will also check websites of large and reputable publishers to see if they list any relevant “best paper” awards in a given subject area. · For each subject area, we will collate a maximum of 10 awards that meet our selection criteria for data extraction.

1.2 Characterise SCImago Subject Area (SA) journal lists and screening data.

1.2.1 Number of journals per SCImago Subject Area

##count journals per SA
# count_j_perSA <- SAdata %>%
#     count(Subject_area) %>%
#     arrange(desc(n)) 

#table(SAdata$Subject_area)
#min(table(SAdata$Subject_area))
#max(table(SAdata$Subject_area))

#all SA as a simple barplot
SAdata %>% 
  #filter(!is.na(Subject_area)) %>% 
  count(Subject_area) %>%
  arrange(Subject_area) %>%
  ggplot(aes(x = reorder(Subject_area, desc(Subject_area)), y = n)) +
  geom_bar(stat = "identity", position = position_dodge(0.9)) +
  coord_flip()

#### Figure S2. Counts of journal Titles per SCImago ranking lists by Subejct Area.

The numbers of journals per SA vary widely reflecting the differences in sizes of different research disciplines. Random sampling of journals would result in uneven representatation of disciplines biasing the conclusions toward being representative of the largest disciplines, potentially with largest number of relevant awards.

1.2.2 Overlap of journals across SCImago Subject Areas

#names(SAdata)
# length(unique(SAdata$Title)) #number of unique journals
# length(unique(SAdata$Title))/length(SAdata$Title) #60% unique

#reduce to top 100 from each SA
SAdata %>% 
  #filter(!is.na(Subject_area)) %>% 
  group_by(Subject_area) %>%
  top_n(100, Title) -> SAdata100

#dim(SAdata100)

#count number of duplicated journal titles across all SA top 100 journal titles:
#sum(duplicated(SAdata100$Title)) #644 duplicated journal Titles
#sum(duplicated(SAdata100$Title))/length(SAdata100$Title) #24% duplicated journal Titles (out of 2700)

#display all duplicate rows in data frame
SAdata100 %>%
  group_by(Title) %>%
  filter(!is.na(Reviewer_name)) %>%
  filter(n()>1) %>%
  ungroup() %>% 
  arrange(Title) #%>% View
## # A tibble: 30 × 35
##    Subject_area      Rank Sourceid Title Reviewer_name Link_journal Society_name
##    <chr>            <dbl>    <dbl> <chr> <chr>         <chr>        <chr>       
##  1 Immunology and …    28  1.30e 4 PLoS… Upama Aich    https://jou… <NA>        
##  2 Neuroscience        18  1.30e 4 PLoS… Joshua Wang   https://jou… <NA>        
##  3 Chemical Engine…     8  2.75e 4 Prog… Ju Zhang      https://www… <NA>        
##  4 Energy               9  2.75e 4 Prog… Andrew Pua    https://www… The Combust…
##  5 Chemical Engine…    88  1.95e 4 Shiy… Ju Zhang      http://www.… <NA>        
##  6 Energy              95  1.95e 4 Shiy… Andrew Pua    http://www.… China Assoc…
##  7 Energy              43  2.11e10 Sola… Andrew Pua    https://onl… Unknown     
##  8 Physics and Ast…    67  2.11e10 Sola… Nina Trubano… https://onl… <NA>        
##  9 Earth and Plane…    29  2.83e 4 Spac… Joanna        https://www… <NA>        
## 10 Physics and Ast…    49  2.83e 4 Spac… Nina Trubano… https://www… <NA>        
## # ℹ 20 more rows
## # ℹ 28 more variables: Has_relevant_award <chr>, Link_award <chr>,
## #   Notes_award <chr>, Checker_name <chr>, Check_pass <lgl>, Check_notes <chr>,
## #   Award_name_extract <chr>, Extract <lgl>, Award_extractor_name <chr>,
## #   Extracted <lgl>, Comments <chr>, Archived <lgl>, Issn <chr>, SJR <dbl>,
## #   SJR_Best_Quartile <chr>, H_index <dbl>, `Total_Docs_(2021)` <dbl>,
## #   `Total_Docs_(3years)` <dbl>, Total_Refs <dbl>, …

Across top 100 journals in each SA ranking list there are 644 duplicated journal titles (23.9%). This indicates that one journal can often be classfied into one or more related Subject Areas (e.g. “Trends in Cognitive Sciences” is included bot in Neuroscience and Psychology SA). This overlap implies that the related awards will also often be relevant to more than one SA. Thus, analysing awards

1.2.2.1 Table S1.

Meta-data for the SCImago journal rankings by Subject Areas dataset.

# making a scrollable table
kable(SAmeta, "html") %>%
  kable_styling("striped", position = "left") %>%
  scroll_box(width = "100%", height = "500px")
column name description data type [options]
Subject_area Name of the Scimago Subject Area (SA) representing 27 major thematic categories. categorical [one of 27 Subject Areas]
Rank Rank of a given Journal within a given Scimago Subject Area. integer
Sourceid Unique ID for the journal. integer
Title Name of the journal. free text
Reviewer_name Name of a person who reviews the journal trying to find associated eligible awards. free text
Link_journal Year when award was won/announced for a give individual winner, as reported on the award page or other documents with winner information. web link
Society_name Name of a person who won the award, as reported on the award page or other documents with winner information. free text
Has_relevant_award Gender assigned to an individual past awardee names using information from pronouns, images, names) available on award websites, personal or institutional websites, or from first names. categorical [yes; no]
Link_award Main source of information for assigning gender. Order of preference: pronouns, photo, name. web link
Notes_award Affiliation institution (usually university) assigned to a winner when the award was received using information available on award websites, personal or institutional websites, or information in the awarded paper. free text
Checker_name Name of the person who cross-checked a given row of screening data. free text
Check_pass Main source of the winner’s affiliation information (institution, country) for the year when the award was received or announced. categorical [yes; no]
Check_notes Any comments for awardee, e.g. award sub-category, or link to additional info from Internet searches. free text
Award_name_extract Whether the award is shared with other authors of the same article. categorical [yes; no]
Extract Whether a bio/profile of the winner is shown on the award page/announcement (with more extended information than affiliation information only). categorical [yes; no]
Award_extractor_name Name of the person who extracted award data . categorical [yes; no]
Extracted Whether a given award has been extracted. categorical [yes; no]
Comments Anny comments regarding screened and extracted data in a given row. free text
Archived DOI of the winning paper. DOI code in a format starting with “10.”. DOI
Issn ISSN number of teh journal free text
SJR SCImago Journal Rank indicator expresses the average number of weighted citations received in the selected year by the documents published in the selected journal in the three previous years, i.e. weighted citations received in year X to documents published in the journal in years X-1, X-2 and X-3 free text
SJR Best Quartile Anny comments regarding extracted data in a given row. categorical [Q1; Q2; Q3; Q4]
H index The h index expresses the journal’s number of articles (h) that have received at least h citations. It quantifies both journal scientific productivity and scientific impact and it is also applicable to scientists, countries, etc. number
Total Docs. (2021) Output of the selected period. All types of documents are considered, including citable and non citable documents. integer
Total Docs. (3years) Published documents in the three previous years (selected year documents are excluded), i.e.when the year X is selected, then X-1, X-2 and X-3 published documents are retrieved. All types of documents are considered, including citable and non citable documents. integer
Total Refs. Total number of all the bibliographical references in a journal in the selected period. integer
Total Cites (3years) Number of citations received in the seleted year by a journal to the documents published in the three previous years, –i.e. citations received in year X to documents published in years X-1, X-2 and X-3. All types of documents are considered. integer
Citable Docs. (3years) Number of citable documents published by a journal in the three previous years (selected year documents are excluded). Exclusively articles, reviews and conference papers are considered. integer
Cites / Doc. (2years) Average citations per document in a 2 year period. It is computed considering the number of citations received by a journal in the current year to the documents published in the two previous years, –i.e. citations received in year X to documents published in years X-1 and X-2. number
Ref. / Doc. References per Document is the average number of references per document in the selected year. number
Country Country where journal is indexed by Scopus. categorical
Region Eight Major World Regions in used to facilitate sectorial analyses. categorical
Publisher name of teh journal publisher. free text
Coverage Year range for which publication data is available on Scopus (?). free text
Categories Tags for relevant 309 specific subject categories according to Scopus® Classification. categorical

1.2.2.2 Table S2.

Meta-data for the Best Paper awards dataset.

# making a scrollable table
kable(BPmeta, "html") %>%
  kable_styling("striped", position = "left") %>%
  scroll_box(width = "100%", height = "500px")
column name description data type [options]
Timestamp System-recorded data and time of when the given row of data was submitted via the Google Form. time
Name of the extracting person Name of the extracting person: Please use your name in one consistent form (should be matching the name you gave us on EOI and recorded on GoogleSheet for award screening). It is best to always use your first and last name, to avoid confusion. free text
Scimago Subject Area Scimago Subject Area: Record name of the Scimago Subject Area (SA) you are doing extractions for - as in the Subject_area column in the GoogleSheet with journal and awards lists. Should be exactly as used in the original list of Subject Areas. categorical [one of 27 Subject Areas]
Full name of the award Full name of the award: Record specific award name, as in the Award_name_extract column in the GoogleSheet with journal and awards lists. free text
Source of the information on the award Source of the information on the award: Ideally, enter a link to an online page/document with information on award eligibility and assessment criteria. If not available, could be also a link to any document describing the award. If you cannot find any information about the award enter “NA”. You can paste more than one link separated by “;”. weblink
Paste the information text describing the award Paste the information text describing the award: Ideally, copy and paste the section of the online page/document with information on award eligibility and assessment criteria only. This description text will be used for text-mining analyses. Do not copy and paste list of past winners or generic information about the journal or society. Do not copy and paste any images. If you cannot find any information about the award enter “NA”. free text
Confirm eligibility of the award Confirm eligibility of the award: · This award is for a single published research contribution in a format of a research article (theses / dissertation are allowed if published as a journal article). · This award is managed by a journal, publisher, or a learned society. · This award is NOT discontinued or restricted to applicants from underrepresented groups, e.g., women-only / minorities-only. Options: yes (eligible award), no (not eligible - make note on the reasons below and enter NA to the following compulsory questions, then submit), unclear (make note on the reasons below) categorical [yes; no; unclear]
Comment on the award eligibility Comment on the award eligibility: Any comments, e.g. reasons for excluding this award. free text
Full name of the awarding journal Full name of the awarding journal: Record if award is associated with a specific journal. Enter “NA” if not associated with a specific journal. free text
Full name of the awarding society Full name of the awarding society: Record if award is associated with a specific society. Enter “NA” if not associated with a specific society. free text
Full name of the awarding publisher/other body Full name of the awarding publisher/other body: Record if award is associated with a specific publisher (group of journals) or awarding organisation other than learned society or journal. Enter “NA” if not associated with a specific publisher. free text
Comment on the awarding body Comment on the awarding body: Any comments, e.g. if more than one journal considered for the award. free text
Award cash Award cash: Code whether there is any cash coming with the award. - Select “yes” if award description mentions any cash given to the winners. - Select “no” if award description does not mention any cash given to the winners. Travel awards, publishing discounts, plenary talks, etc. do not count as cash awards. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Inclusivity phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. free text
Comment on the award cash Comment on the award cash: Add comments on cash amount, if provided, and note any other benefits winners receive, as relevant (e.g., travel awards, publishing discounts, plenary talks) free text
Target career stage of eligible applicants Target career stage of eligible applicants, as stated in the award information: As stated in the award information. More than one choice is possible. If it is for any paper in a journal/s, select “any career stage”. Options: student, early-career, mid-career, any career stage, unclear. categorical [student; early-career; mid-career; any career stage; unclear]
Comment on the career stage Comment on the career stage: Any comments, e.g. paste the phrasing of who is eligible for the award. free text
Flexibility of the eligibility criteria Flexibility of the eligibility criteria: Code whether explicitly allowing for career interruptions in eligibility timeframes: - Select “not applicable” if description only states that published research has to be performed while studying for a degree OR if there is no eligibility limit in terms of years since an event (e.g. a PhD or author’s age. - Select “yes” if the description says that there is an eligibility limit of years after a degree to apply for the. award and this limit can be extended in special cases”. - Select “no” if the description says that there is an eligibility limit of years after a degree to apply for the award but it does not mention that this limit can be extended in special cases. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Eligibility phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not applicable, not available. categorical [yes; no; not applicable; not available]
Eligibility phrasing Eligibility phrasing: Wording of the eligibility criteria in relation to career stage in the relevant documentation, if available. free text
Inclusivity statement Inclusivity statement: Code whether underrepresented/minority groups are encouraged to apply for the award (this does not mean that the award is restricted to underrepresented groups, e.g., women-only) or award information includes declarations of commitment to equity / diversity / inclusivity (EDI): - Select “yes” if award description mentions underrepresented/minority groups or EDI. - Select “no” if award description does not mention anything about underrepresented/minority groups or EDI. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: Extract this information from award descriptions only - do not include information from other documents not directly related to the award, e.g. journal/society/institutional policies or mission statements. In the next question (“Inclusivity phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. categorical [yes; no; not available]
Inclusivity phrasing Inclusivity phrasing: Wording of the inclusivity statement in the relevant documentation, if available. free text
Assessor transparency Assessor transparency: Code whether information is provided on who will be conducting assessments of candidate papers (for example, that editors-in-chief will be deciding): - Select “yes” if information is provided, e.g. names, or stating that journal editors will be doing this, or mentioning specific existing committee that is described somewhere else (e.g. on a society webpage). - Select “no” if description does not mention any information on the assessors. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). In the next question (“Assessor phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. categorical [yes; no; not available]
Assessor phrasing Assessor phrasing: Wording of the information on who will be conducting the assessments, if available. free text
Process transparency Process transparency: Code whether breakdown of the applicants / candidates by gender or geographic region is publicly available: - Select “yes” if such information is provided (e.g. on a society webpage). - Select “no” if such information is not provided. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Process phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. categorical [yes; no; not available]
Process phrasing Process phrasing: Wording of the information on the transparency of the process, e.g. if breakdown of the applicants / candidates by gender or geographic region is publicly available. free text
Feedback availability Feedback availability: Code whether award information included an offer of constructive feedback for unsuccessful applicants: - Select “yes” if such feedback can be provided (e.g. on request). - Select “no” if such feedback is not provided or not mentioned. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Feedback phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. categorical [yes; no; not available]
Feedback phrasing Feedback phrasing: Wording of the information on whether/how feedback will be provided, if available. free text
Criteria transparency Criteria transparency: Code whether assessment criteria are detailed (more than one vague sentence) or vague (often one vague sentence, e.g. “assessed on innovation and novelty”): - Select “yes” if assessment criteria are more than one vague sentence (e.g., mentions things like readability of the language, sample size, performing alternative tests/analyses to check robustness of the results, sharing data or code, registering the study/protocol - whatever is relevant to a given research field). - Select “no” if assessment criteria are only one vague sentence (e.g. award is for “best paper”, “outstanding contribution”, “excellent research”, “novel insights”, etc.). - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Criteria phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. categorical [yes; no; not available]
Criteria phrasing Criteria phrasing: Wording of the information on the assessment criteria, if available. free text
Valuing Open Science Valuing Open Science: Code whether any Open Science practices (data, code, materials sharing, preregistration, transparency of reporting, etc.) are explicitly included in the assessment criteria: - Select “yes” if award description mentions valuing any of the Open Science practices (e.g., sharing data or code, registering the study/protocol, performing replication studies - whatever is relevant to a given research field). - Select “no” if the award description does not mention valuing any of the Open Science practices. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Valuing Open Science phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. categorical [yes; no; not available]
Valuing Open Science phrasing Valuing Open Science phrasing: Wording of the information on the assessment criteria valuing Open Science practices, if available. free text
Self-nomination allowed Self-nomination allowed: Code whether candidates can self-nominate for the award. - Select “yes” if the award description explicitly mentions that candidates can self-nominate. - Select “no” if the award description states that candidates/papers need to be nominated by somebody else or does not explicitly mention that candidates can self-nominate. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Self-nomination phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available, categorical [yes; no; not available]
Self-nomination phrasing Self-nomination phrasing: Wording of the information on how to nominate (e.g., using a checkbox on the manuscript submission form), if available. free text
Letter required Letter required: Code whether candidates are required to provide nomination/recommendation letter/s: - Select “yes” if award description explicitly mentions that candidates have to provide nomination or support letters from others. - Select “no” if the award description states that candidates/papers do not need to provide letters from somebody else or does not explicitly mention anything about such letters. - Select “not available” if there is no page/document describing the award (e.g. only the list of winners is publicly published). NOTE: In the next question (“Letter requirement phrasing”) copy and paste relevant text from the award document or make a note if no such document/information is available. Options: yes, no, not available. categorical [yes; no; not available]
Letter requirement phrasing Letter requirement phrasing: Wording of the information on the requirement for written nominations / reference letters, if available. free text
Awardee list source Awardee list source: Source of the information on the past winners - paste a link to a webpage, file name, personal information, etc., showing a list of past winners (any year span). Write “not available” if no such list exists. weblink
Awardee list most recent year Awardee list most recent year: The most recent year for which information on past awardees is available (enter whole number, e.g. 2022). Leave empty if no such list exists. integer
Awardee list earliest year Awardee list earliest year: The earliest year for which information on past awardees is available (enter whole number, e.g. 1999). Leave empty if no such list exists. integer
Comments on awardees list Comments on awardees list: Any comments for awardees list, e.g. if Internet searches might be needed to locate announcements with the names of the past winners. free text
Comments_general Comments_general: Any other notes and comments on issues, assumptions, or seeking additional information from the award committees / contacts. free text
Checked Whether a given row of extracted data has been cross-checked. categorical [yes; no]
Checker_name Name of the person who cross-checked a given row of extracted data. free text
Checker_comments Anny comments regarding extracted data in a given row. free text
Row_excluded Whether a given row of extracted data has been cross-checked. integer [1 = yes; 0 = no]
Award_individual Whether a given award recognises individual (selected) authors rather than all authors of a winning paper. categorical [yes; no]
Award_impact_metrics_mentioned Whether award criteria mention impact metrics (number of citations, downloads) as basis of winner selection. categorical [yes; no]
Award_impact_metrics_only Whether impact metrics (number of citations, downloads) ar the sole basis of winner selection. categorical [yes; no]
Award_impact_metrics_comment Quote relevent sections of award description if impact metrics are mentioned. Add any comments regarding award criteria and impact metrics. free text
Award_contact_provided Whether specific contact information (email) is provided on the award webpage or publicly available application/description documents. categorical [yes; no]
Award_integrity_mentioned Whether award description states how potential conflictes of interests will be managed categorical [yes; no; not available]
Award_integrity_comment Quote relevent sections of award description on how potential conflicts of interests will ba managed. free text
Award_cash_max_USD_pperson The maximum reported amount of cash awarded per winning paper, recalculated online into USD. Integer
N_winners_extracted Number of individual winners from years 2001-2022 extracted for this awards (in indiv_winners data set). Integer
Archived_files Whether award website and over information have been archived as pdf files in the dedicated project folder. categorical [yes; no]
Comment_extraction Anny comments regarding extracted data in a given row. free text

1.2.2.3 Table S3.

Meta-data for the individual winners dataset.

# making a scrollable table
kable(BPindiv_meta, "html") %>%
  kable_styling("striped", position = "left") %>%
  scroll_box(width = "100%", height = "500px")
column name description data type [options]
award_SA Name of the Scimago Subject Area (SA) you are doing extractions for - as in the Subject_area column in the GoogleSheet with journal and awards lists. Should be exactly as used in the original list of Subject Areas. categorical [one of 27 Subject Areas]
record_count Subsequent numbers for counting extracted winners within each award. integer
award_name Name of the award - as in the Award_name column in the main GoogleSheet with awards main data extraction. free text
award_link Weblink to the main Internet page with award description / information. weblink
individual Whether an award is individual or article-focused. Individual awards are defined as awards that are usually not shared equally between all article authors. For example the award names as a winner only the ECR authors or only the first author (unless there are more than one ECR/first authors who contributed equally). categorical [yes; no]
award_year Year when award was won/announced for a give individual winner, as reported on the award page or other documents with winner information. integer
awardee_name Name of a person who won the award, as reported on the award page or other documents with winner information. free text
awardee_gender Gender assigned to an individual past awardee names using information from pronouns, images, names) available on award websites, personal or institutional websites, or from first names. categorical [F = female; M = male; ? - unassigned]
gender_source Main source of information for assigning gender. Order of preference: pronouns, photo, name. categorical [pronouns; photo; name]
affiliation_institution Affiliation institution (usually university) assigned to a winner when the award was received using information available on award websites, personal or institutional websites, or information in the awarded paper. free text
affiliation_country Affiliation country assigned to a past winner for the year when the award was received using information available on award websites, personal or institutional websites, or information in the awarded paper. free text
affiliation_info_source Main source of the winner’s affiliation information (institution, country) for the year when the award was received or announced. categorical [award page; award announcement; article; other]
awardee_comment1 Any comments for awardee, e.g. award sub-category, or link to additional info from Internet searches. free text
shared Whether the award is shared with other authors of the same article. categorical [yes; no]
awardee_profile_shown Whether a bio/profile of the winner is shown on the award page/announcement (with more extended information than affiliation information only). categorical [yes; no]
awardee_photo_shown Whether a photo of the winner is shown on the award page/announcement. categorical [yes; no]
awardee_comment2 Any other comments on the awardee or awarded paper, e.g. award sub-category, or link to additional info from Internet searches. free text
awarded_paper_ref Bibliographic reference of the winning paper. free text [reference in any long format]
awarded_paper_doi DOI of the winning paper. DOI code in a format starting with “10.”. DOI
checked Whether a given row of extracted data has been cross-checked. categorical [yes; no]
checker_name Name of the person who cross-checked a given row of extracted data. free text
checker_comment Anny comments regarding extracted data in a given row. free text

2 Supplementary Results

2.1 Award-level dataset characteristics

Overall: Total number of unique names of awards: 222.
Total number of unique names of awards: 27.
Total number of unique names of awarding journals: 177.
Total number of unique names of awarding societies: 127.
Total number of unique names of awarding other bodies: 22.
Total number of awards without award description: 11 (4.9%)

By awarding or funding body: Total number of awards associated with a journal or a few related journals: 176 (78.9 %) Total number of awards associated with a learned society: 144 (64.6 %) Total number of awards associated with other organisation (e.g. a publisher, university, charity): 39 (17.5 %). Most commonly mentioned other organisations: 15 mention “Elsevier”, 6 mention “Wiley”).

By award starting date Award year of the earliest listed winner used as a proxy: 1923. Number of awards with the earliest listed winner within 2011-2023: 107 (48 %).

By target career stage: Total number of awards for ‘any career stage’ authors: 179 (80.3%).
Total number of awards for ‘early career’ authors: 31 (13.9%). Total number of awards for ‘student’ authors: 27 (12.1%). Total number of awards for ‘early career’ or ‘student’ authors: 20 (9%).
Total number of awards for ’mid-career’authors: 5 (2.2%).

By focus on individuals (“individual award”) or whole article (“team award”): Total number of awards for individual authors: 66 (29.6%).
Total number of awards for individual authors with any career stage being eligible: 28 (42.4%).
Total number of awards for individual authors with inflexible time limits for eligibility:21.
Total number of awards for individual authors with flexible time limits for eligibility: 5.

Summarizing other award characteristics: Total number of awards with inclusivity statement: 2 ( 0.9%).
Total number of awards with assessor transparency: 114 ( 51.1%).
Total number of awards with assessment integrity mentioned: 19 ( 8.5%).
Total number of awards with process transparency: 2 ( 0.9%).
Total number of awards with feedback available: 0 ( 0%).
Total number of awards with contact details available: 38 ( 17%).
Total number of awards with self-nomination allowed: 28 ( 12.6%). Total number of awards with required nomination letter: 38 ( 17%). Total number of awards with non-vague criteria: 40 ( 19%). Total number of awards with non-vague criteria: 21 ( 10%). Total number of awards with non-vague criteria: 8 ( 3.8%). Total number of awards which value Open Science: 1 ( 0.5%). Total number of awards with cash award: 112 ( 50.2%).

2.2 Award-level dataset characteristics

Overall: Total number of unique names of individual winners records: 1083.

##Check award names - same award could have been extracted in different SA, or different awards have same names 
# table(duplicated(BPdata$Award_name)) #0 duplicated, 223 unique

## Simple counts
# #eligible awards per SA
# count_awards_perSA <- BPdata %>% 
#     count(Scimago_Subject_Area) #%>%
#     #arrange(desc(n)) 
# 
# count_awards <- BPdata %>% 
#  # group_by(Scimago_Subject_Area) %>%
#     count(Descr_available) %>%
#     arrange(desc(n)) #description not available for 12 awards (5%)

# BPdata %>%
#   group_by(Awarding_journal) %>%
#   filter(n() > 1) %>%
#   select(Awarding_journal) %>%
#   filter(!is.na(Awarding_journal)) %>%
#   count() #no journals with more than one award across different

# BPdata %>%
#   group_by(Awarding_society) %>%
#   filter(n() > 1) %>%
#   select(Awarding_society) %>%
#   filter(!is.na(Awarding_society)) %>%
#   count() #some societies with more than one award (max.4)

BPdata %>%
  group_by(Awarding_other) %>%
  filter(n() > 0) %>%
  select(Awarding_other) %>%
  filter(!is.na(Awarding_other)) %>%
  count() #%>% View()#list of other awarding bodies mentioned with counts

# #count number of unique awards per year:
# BPindiv %>% 
#   #filter(!is.na(affiliation_country)) %>% 
#   group_by(award_year) %>% 
#   summarise(count = n_distinct(award_name))  

##How many awards per Scimago_Subject_Area
# table(BPdata$Scimago_Subject_Area) #note many SA with <10 extracted awards - this is because of many journals being shared between SA


table(BPdata$Awarding_journal == "NA") #46 not associated with a journal
table(BPdata$Awarding_society == "NA") #80 not associated with a learned society
table(BPdata$Career_stage, useNA = "always")
table(BPdata$Award_individual, useNA = "always") #62 yes
table(BPdata$Flexible_eligibility, useNA = "always") #only 5 - "yes" 
#View(BPdata[BPdata$Flexible_eligibility == "yes", ]) #see rows with "yes"

#two-way table for the Career_stage and Flexible_eligibility
table(BPdata$Career_stage, BPdata$Flexible_eligibility, useNA = "always") #most "not applicable" is for "any career stage"

#two-way table for the Flexible_eligibility and Award_individual
table(BPdata$Flexible_eligibility, BPdata$Award_individual, useNA = "always") #flexible ones are for ECRs only

#two-way table for the Career_stage and Award_individual
table(BPdata$Career_stage, BPdata$Award_individual, useNA = "always") #individual are mostly for ECRs
#BPdata$Award_name[BPdata$Career_stage == "unclear"]
#BPdata$Award_name[BPdata$Career_stage == "early-career" & BPdata$Award_individual == "no"]
#BPdata$Award_name[BPdata$Career_stage == "early-career" & BPdata$Award_individual == "no"]
#length(BPdata$Award_name[BPdata$Career_stage == "any career stage" & BPdata$Award_individual == "no"]) #151

#three-way table
table(BPdata$Career_stage, BPdata$Flexible_eligibility, BPdata$Award_individual) #most "not applicable" is for "any career stage"

#length(BPdata$Award_name[BPdata$Career_stage == "any career stage" & BPdata$Award_individual == "no"]) #151
length(BPdata$Award_name[BPdata$Award_individual == "no" & BPdata$Career_stage == "any career stage" & BPdata$Flexible_eligibility == "not applicable"]) #146 not applicable flexibility
length(BPdata$Award_name[BPdata$Award_individual == "no" & BPdata$Career_stage == "any career stage" & BPdata$Flexible_eligibility == "not available"]) #5 with no description
length(BPdata$Award_name[BPdata$Award_individual == "no" & BPdata$Career_stage == "any career stage" & BPdata$Flexible_eligibility == "yes"]) #0 with flexibility

length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Career_stage == "any career stage"]) #29 not limited to specific career stage
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Flexible_eligibility == "not applicable"]) #39 not applicable flexibility
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Flexible_eligibility == "not available"]) #2 with no description
length(BPdata$Award_name[BPdata$Award_individual == "yes" & BPdata$Flexible_eligibility == "yes"]) #5 with flexibility


table(BPdata$Inclusivity_statement, useNA = "always") #only 2 "yes"
BPdata$Award_name[BPdata$Inclusivity_statement == "yes"] #see rows with "yes"

table(BPdata$Assessors_transparency, useNA = "always") #115 "yes"
#View(BPdata[BPdata$Assessors_transparency == "yes", ]) #see rows with "yes"
names(BPdata)
# sum(str_count(tolower(BPdata$Assessor_phrasing), "editor"), na.rm = TRUE) #163 - counting all mentions
# sum(str_detect(BPdata$Assessor_phrasing, "editor"), na.rm = TRUE) #37 - counting once per award

table(BPdata$Process_transparency, useNA = "always") #only 1 "yes"
#BPdata$Award_name[BPdata$Process_transparency == "yes"] #see award name with "yes"

table(BPdata$Self_nomination, useNA = "always") #28 yes
#View(BPdata[BPdata$Self_nomination == "yes", ]) #see rows with "yes"

table(BPdata$Letter_required, useNA = "always") #38 yes
#View(BPdata[BPdata$Letter_required == "yes", ]) #see rows with "yes"

table(BPdata$Letter_required, BPdata$Self_nomination, useNA = "always") #two-way table
dim(BPdata[BPdata$Letter_required == "yes" & BPdata$Self_nomination == "yes", ])[1] #mentioning letter and self-nomination 
dim(BPdata[BPdata$Letter_required == "no" & BPdata$Self_nomination == "no", ])[1] #not mentioning letter and self-nomination 

table(BPdata$Feedback_availability, useNA = "always") #0 "yes"

table(BPdata$Award_contact_provided, useNA = "always") #38 "yes"

table(BPdata$Criteria_transparency, useNA = "always") #40 yes
#View(BPdata[BPdata$Criteria_transparency == "yes", ]) #see rows with "yes"

table(BPdata$Award_impact_metrics_mentioned, useNA = "always") #21 "yes"

table(BPdata$Award_impact_metrics_only, useNA = "always") #8 "yes"

table(BPdata$Open_science, useNA = "always") #1 "yes"
View(BPdata[BPdata$Open_science == "yes", ]) #see rows with "yes": Social Sciences: Review Of Research Award  by American Educational Research Association. Two journals are considered: Review of Educational Research. Review of Research in Education

table(BPdata$Awardee_list_most_recent_year, useNA = "always") #a few <2020 and 6 NA
#View(BPdata[is.na(BPdata$Awardee_list_most_recent_year), ]) #see rows with NA

table(BPdata$Awardee_list_earliest_year, useNA = "always") #some very old, 19 NA
#View(BPdata[is.na(BPdata$Awardee_list_earliest_year), ]) #see rows with NA
#hist(BPdata$Awardee_list_earliest_year)
length(BPdata$Awardee_list_earliest_year[BPdata$Awardee_list_earliest_year > 2010 & !is.na(BPdata$Awardee_list_earliest_year)] ) #107 from 2011-2023

2.3 Figure 1

A. Screening effort - count numbers of journals screened to get 10 shortlisted awards

#load journal screening summary dataset:
BPscreening <- read_csv(here("data","BP_awards_lists_SHAREDCOPY - scimagojr_2021_SA.csv")) #load award screening summary data TODO
## Rows: 29 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (22): Subject_Area, Reveiwer_name, Reviewer_email, Screening_status (ass...
## dbl  (4): Nr, N_records_imported, N_records_screened, Extraction1_N_awards_e...
## lgl  (2): Extraction1_comments, Extraction2_winners_checking_comments
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
BPscreening <- BPscreening[!is.na(BPscreening$Subject_Area), ] #remove 2 extra rows without Subject_Area
count_awards_perSA <- BPdata %>% count(Scimago_Subject_Area) #count included awards per SA
BPscreening$N_included <- count_awards_perSA$n #add included award counts per SA
BPscreening$N_excluded <- BPscreening$N_records_screened - BPscreening$N_included 
BPscreening$Subject_Area <- as.factor(BPscreening$Subject_Area) #convert to factor

## reshape into long dataframe format:
BPscreening %>% select(Subject_Area, N_included, N_excluded) %>% gather(key = status, count, N_included:N_excluded) -> BPscreening_long

#reorder status levels and edit labels:
BPscreening_long <- BPscreening_long %>% 
  mutate(status = factor(status, levels = rev(c("N_excluded", "N_included"))))

BPscreening_long$status <- as.character(BPscreening_long$status)  #convert to character
BPscreening_long$status[BPscreening_long$count <= 5] <- "N_included_low" #use to mark SA with 5 or less awards included    

  
figure1A <- BPscreening_long %>%
    mutate(status = recode(status, 'N_excluded' = 'not found', 'N_included_low' = 'found and included (<=5)', 'N_included' = 'found and included (>5 awards)')) %>%
    ggplot(aes(x = reorder(Subject_Area, desc(Subject_Area)), y = count, fill = status)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,25,50,75,100)) +
    theme_bw() + 
    scale_fill_manual(values = c("#CDC8B1", "#8B8878", "#EEE8CD")) +
    labs(x = "Scimago Subject Area", y = "Count of screened Scimago journals", fill = "Award: ") + 
    theme(legend.position = "top", legend.justification=c(0, 1), axis.title.x = element_text(size = 10)) +
    geom_hline(yintercept = 10, linetype = "dotted", color = "red", size = 0.6) + 
    scale_x_discrete(position = "top")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

B. Include awards with and without award description

figure1B <- BPdata %>%
    count(Scimago_Subject_Area, Descr_available) %>%
    arrange(desc(n)) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Descr_available)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_bw() + 
    scale_fill_manual(values = c("#669933", "gray80")) +
    labs(x = "", y = "Count of included awards", fill = "Award description:") + 
    scale_x_discrete(labels = NULL) + labs(x = "")  + #used to remove vertical labels, also breaks = NULL
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

2.4 Award descriptions

Word counts per award description:

#add new variable with counts of words in the award description:
BPdata$Award_description_wordcount <- str_count(BPdata$Award_description, "\\w+")
#hist(str_count(BPdata$Award_description, "\\w+"), breaks = 20)

BPdata[is.na(BPdata$Scimago_Subject_Area), ] #no NA, but no description was coded as NA, so if 1 then change to 0 (no description)
## # A tibble: 0 × 56
## # ℹ 56 variables: Timestamp <dttm>, Extractor_name <chr>,
## #   Scimago_Subject_Area <chr>, Award_name <chr>,
## #   Source_of_the_information_on_the_award <chr>, Award_description <chr>,
## #   Eligible <chr>, Comment_on_the_award_eligibility <chr>,
## #   Awarding_journal <chr>, Awarding_society <chr>, Awarding_other <chr>,
## #   Comment_on_the_awarding_body <chr>, Award_cash <chr>,
## #   Comment_on_the_award_cash <chr>, Career_stage <chr>, …
BPdata$Award_description_wordcount[BPdata$Award_description_wordcount < 2] <- 0 #if 1 then change to 0 (no description)

#dim(BPdata[BPdata$Award_description_wordcount < 100, ])[1] #90 descriptions have less than 100 words! (includes 12 with no description)
#dim(BPdata[BPdata$Descr_available == "available" & BPdata$Award_description_wordcount < 150, ])[1] #120 descriptions have less than 200 words! (excludes 12 with no description), so 120/211 (75%)

# BPdata[BPdata$Award_description_wordcount > 250, c("Scimago_Subject_Area", "Award_name", "Award_description")] #18 awards (9%)
# BPdata[BPdata$Award_description_wordcount > 400, c("Scimago_Subject_Area", "Award_name", "Award_description")] #5 awards (2%)
# BPdata[BPdata$Award_description_wordcount > 500, c("Scimago_Subject_Area", "Award_name", "Award_description")] #2 awards (1%)

Median length of award description: 123.
Plot of award description lengths - overall:

figure1C <- BPdata %>% 
  filter(!is.na(Award_description_wordcount)) %>%
  ggplot(aes(x = Award_description_wordcount)) + 
  geom_histogram(binwidth = 20, fill = "grey60") +
  theme_bw() + 
  labs(x = "Number of words in award description", y = "Award count") + 
  scale_x_continuous(breaks = c(seq(0, 1000, by = 100))) +
  theme(legend.position = "none", axis.title.x = element_text(size = 10))

Plot of award description lengths - by Subject Area

BPdata %>% 
  filter(!is.na(Award_description_wordcount)) %>%
  ggplot(aes(x = Award_description_wordcount)) + 
  geom_histogram(binwidth = 20, fill = "grey60") +
  theme_bw() + 
  labs(x = "Number of words in award description", y = "Award count") + 
  scale_x_continuous(breaks = c(seq(0, 1000, by = 100))) +
  theme(legend.position = "none", axis.title.x = element_text(size = 10)) + 
  facet_wrap(~Scimago_Subject_Area, ncol = 3)

How old are the awards? NOTE: Using the year of oldest listed past winner as a proxy.

Overall:

#table(!is.na(BPdata$Awardee_list_earliest_year)) # values available

#plot overall - beeswarm
figure1D <- BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>%
  ggplot(aes(y = Awardee_list_earliest_year, x = Row_excluded)) + 
  geom_beeswarm(color = "#000000", alpha = 0.2, cex = 3.3) +
  coord_flip() +
  theme_bw() + 
  labs(y = "Past winners earliest year", x = "") + 
  scale_x_discrete(labels = NULL) + labs(x = "")  + #used to remove vertical labels, also breaks = NULL
  scale_y_continuous(breaks = c(seq(1900, 2030, by = 10))) +
  theme(legend.position = "none", axis.title.x = element_text(size = 10)) +
  annotate("rect", ymin = 2001, ymax = 2022, xmin = -1.5, xmax = 1.5, alpha = .1,fill = "#008B00")

Plot by Subject Area:

#plat by SA - beeswarm
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>%
  ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = Awardee_list_earliest_year)) + 
  geom_beeswarm(color = "#000000", alpha = 0.1) +
  coord_flip() +
  theme_bw() + 
  labs(y = "Past winners earliest year", x = "Scimago Subject Area") + 
  scale_y_continuous(breaks = c(seq(1900, 2030, by = 10))) +
  theme(legend.position = "none", axis.title.x = element_text(size = 10))

Combine 4 panels and save:

#assemble the panels using patchwork package
figure1 <- (figure1A + figure1B) / figure1C / figure1D + 
  plot_layout(widths = c(1, 1), heights = c(3, 1, 1)) +
  plot_annotation(tag_levels = "A")

#ggsave(plot = figure1, here("plots", "Fig1ABCD_screening_descriptions_v0.png"), width = 18, height = 18, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure1, here("plots", "Fig1ABCD_screening_descriptions_v0.pdf"), width = 18, height = 18, units = "cm", scale = 1.2)

2.5 Figure 2

Figure 2A - Awarding bodies

# table(BPdata$Awarding_journal == "NA")
# table(BPdata$Awarding_society == "NA")
# table(BPdata$Awarding_other == "NA")

#create new variable with simple categorisation of awarding body type mentioned:
BPdata2 <- BPdata %>%
   mutate(
     Awarded_by = case_when(Awarding_journal != "NA" & Awarding_society == "NA" & Awarding_other == "NA" ~ "journal", 
     Awarding_journal == "NA" & BPdata$Awarding_society != "NA" & BPdata$Awarding_other == "NA" ~ "society", 
     Awarding_journal == "NA" & BPdata$Awarding_society == "NA" & BPdata$Awarding_other != "NA" ~ "other", 
     Awarding_journal != "NA" & BPdata$Awarding_society != "NA" & BPdata$Awarding_other == "NA" ~ "journal&society",
     Awarding_journal != "NA" & BPdata$Awarding_society != "NA" & BPdata$Awarding_other != "NA" ~ "journal&society&other", 
     Awarding_journal != "NA" & BPdata$Awarding_society == "NA" & BPdata$Awarding_other != "NA" ~ "journal&other", 
     Awarding_journal == "NA" & BPdata$Awarding_society != "NA" & BPdata$Awarding_other != "NA" ~ "society&other", 
     TRUE ~ "none")
   )

#table(BPdata$Awarded_by)

#create new variable for awards with description or without
BPdata2 %>%  
  select(Award_name, Awarding_journal, Awarding_society, Awarding_other) %>% 
  mutate(
    across(c(Awarding_journal, Awarding_society, Awarding_other), ~ if_else(. == "NA", FALSE, TRUE))
  ) -> BPdata2

#make upset plot using library(ggupset)
figure2A <- BPdata2 %>% 
  column_to_rownames(var = "Award_name") %>% 
  as_tibble(rownames = "Award_name") %>%
  gather(Body, Award, -Award_name) %>% 
  filter(Award) %>%
  select(-Award) %>%
  mutate(Body = recode(Body, 'Awarding_journal' = 'journal', 'Awarding_society' = 'society', 'Awarding_other' = 'other')) %>%
  group_by(Award_name) %>%
  summarize(Awarding_bodies = list(Body)) %>%
  ggplot(aes(x = Awarding_bodies)) +
    geom_bar(fill = "#a70042", width = 0.8) +
    scale_x_upset(order_by = c("freq")) +
    theme_combmatrix(combmatrix.panel.striped_background = FALSE,
                    combmatrix.panel.point.color.fill = "#a70042",
                    combmatrix.panel.line.color = "#a70042",
                    plot.title = element_text(family = "sans", size = 20, face = "plain", color = "black"),
                    axis.title.x = element_text(family="sans", size = 14, color = "black", face="plain"), 
                    axis.title.y = element_text(family="sans", size = 14, color = "black", face = "plain", vjust = -2),
                    ) +
    labs(title = "A", x = "Awarding bodies combination", y = "Award count")

Figure 2B - target awardee career stages

#table(BPdata$Career_stage) #mai options: student, early-career, mid-career, any career stage, unclear

#create new variable with separate career stage values as a list
BPdata2 %>%  
  mutate(
    Career_stage_list = str_split(BPdata$Career_stage, pattern = ", ")
  ) -> BPdata2

#BPdata$Career_stage_list <- str_split(BPdata$Career_stage, pattern = ", ") #sam as above but modifies original data frame

#make upset plot using library(ggupset)
figure2B <- BPdata2 %>% 
  ggplot(aes(x = Career_stage_list)) +
    geom_bar(fill = "#8B6969",width = 0.8) +
    scale_x_upset(order_by = c("freq")) +
    theme_combmatrix(combmatrix.panel.striped_background = FALSE,
                    combmatrix.panel.point.color.fill = "#8B6969",
                    combmatrix.panel.line.color = "#8B6969",
                    plot.title = element_text(family = "sans", size = 20, face = "plain", color = "black"),
                    axis.title.x = element_text(family="sans", size = 14, color = "black", face="plain"), 
                    axis.title.y = element_text(family="sans", size = 14, color = "black", face = "plain", vjust = -2),
                    ) + 
    labs(title = "B", x = "Target career stages combination", y = "")

Combine 2 panels and save

#assemble the panels using patchwork package
figure2 <- figure2A / figure2B + 
  plot_layout(ncol = 2, nrow = 1, widths = c(1, 1)) #+  plot_annotation(tag_levels = "A") #does not work with this lot type

#ggsave(plot = figure2, here("plots", "Fig2AB_award_descriptions_v0.png"), width = 18, height = 8, units = "cm", bg = "white", dpi = "retina", scale = 1.6)
#ggsave(plot = figure2, here("plots", "Fig2AB_award_descriptions_v0.pdf"), width = 18, height = 8, units = "cm", scale = 1.6)

2.6 Figure 3

#BPdata %>%
#    count(Open_science) %>%
#    ggplot(aes(x = Open_science, y = n, fill = Open_science)) + 
#    geom_col(width = 0.7) +
#    geom_text(position = position_stack(vjust = 0.5), aes(label = n)) + 
#    theme_classic() + 
#    coord_flip() +
#    labs(x = "Open Science valued", y = "Article count", fill = "Open_science") + 
#    theme(legend.position = "none", axis.title.x = element_text(size = 10))

Figure 3A - Summary plots of key award characteristics

BPdata %>% select(Award_individual, Flexible_eligibility, Inclusivity_statement, Assessors_transparency, Award_integrity_mentioned, Process_transparency, Self_nomination, Letter_required, Feedback_availability, Award_contact_provided) -> BPdata3

## reshape into long dataframe format:
BPdata3 %>% gather(key = var_name, value = value, 1:ncol(BPdata3)) -> BPdata3long
#str(BPdata3long)

#reorder var_name levels
BPdata3long <- BPdata3long %>% mutate(var_name = factor(var_name, levels = rev(c("Award_individual",
                                                                            "Flexible_eligibility", 
                                                                            "Inclusivity_statement", 
                                                                            "Assessors_transparency", 
                                                                            "Award_integrity_mentioned", 
                                                                            "Process_transparency",
                                                                            "Self_nomination", 
                                                                            "Letter_required", 
                                                                            "Feedback_availability", 
                                                                            "Award_contact_provided" 
                                                                            ))))


#reorder value levels:
BPdata3long <- BPdata3long %>% mutate(value = factor(value, levels = rev(c("no",
                                                                            "yes", 
                                                                            "not applicable", 
                                                                            "not available"))))
figure3A <- BPdata3long %>% 
  group_by(var_name) %>% 
  count(var_name, value) %>% 
  ggplot(aes(x = var_name, y = n, fill = value)) +
  #geom_bar(aes(fill = value), stat = "identity", position = position_dodge(0.9)) +
  geom_bar(aes(fill = value), stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("#E5E5E5", "#CCCCCC", "#C1FFC1",  "#FA8072")) +
  labs(x = "Award characteristics", y = "Award count", fill = "Award scores") + 
  scale_x_discrete(labels = rev(c("Award is focused on individual authors",
                            "Eligibility career timeline is flexible", 
                            "Inclusivity statement or encouragement is provided",
                            "Assessors are revealed",
                            "Award integrity mentioned", 
                            "Process is transparent",
                            "Self-nomination is allowed",
                            "Nomination letter is required",
                            "Feedback is availabile", 
                            "Award contact point is provided"
                            ))) +
  theme_classic()

Figure 3B - Summary plots of key award characteristics

BPdata %>% select(Criteria_transparency, Award_impact_metrics_mentioned, Award_impact_metrics_only, Open_science) -> BPdata3

## reshape into long dataframe format:
BPdata3 %>% gather(key = var_name, value = value, 1:ncol(BPdata3)) -> BPdata3long
#str(BPdata3long)

#reorder var_name levels
BPdata3long <- BPdata3long %>% mutate(var_name = factor(var_name, levels = rev(c( "Criteria_transparency", 
                                                                            "Award_impact_metrics_mentioned", 
                                                                            "Award_impact_metrics_only", 
                                                                            "Open_science"))))

#reorder value levels:
BPdata3long <- BPdata3long %>% mutate(value = factor(value, levels = rev(c("no",
                                                                            "yes", 
                                                                            "not available"))))
figure3B <- BPdata3long %>% 
  group_by(var_name) %>% 
  count(var_name, value) %>% 
  ggplot(aes(x = var_name, y = n, fill = value)) +
  #geom_bar(aes(fill = value), stat = "identity", position = position_dodge(0.9)) +
  geom_bar(aes(fill = value), stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("#E5E5E5", "#C1FFC1",  "#FA8072")) +
  labs(x = "Award criteria", y = "Award count", fill = "Award scores") + 
  scale_x_discrete(labels = rev(c("Assessment criteria are detailed out",
                            "Impact metrics are mentioned in criteria",
                            "Impact metrics are only criteria", 
                            "Open Science practices are valued"))) +
  theme_classic()

Combine 2 panels and save

#assemble the panels using patchwork package
figure3 <- figure3A / figure3B + 
  plot_layout(ncol = 1, nrow = 2, heights = c(2, 1)) + 
  plot_annotation(tag_levels = "A") 

#ggsave(plot = figure3, here("plots", "Fig3AB_award_descriptions_v0.png"), width = 18, height = 10, units = "cm", bg = "white", dpi = "retina", scale = 1.2)
#ggsave(plot = figure2, here("plots", "Fig3AB_award_descriptions_v0.pdf"), width = 18, height = 10, units = "cm", scale = 1.2)

2.7 Award characteristics by SA

Figure SX - Award_individual by SA

#table(BPdata$Award_individual) #as a table - 62 are individual(28%)
#table(BPdata$Scimago_Subject_Area, BPdata$Award_individual) #as a table - 9 disciplines have 0!

BPdata %>% 
    mutate(Award_individual = factor(Award_individual, levels = (c("no", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Award_individual) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_individual)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Award is focused on individual authors:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Flexible_eligibility by SA

#table(BPdata$Flexible_eligibility, useNA = "always") #as a table
#table(BPdata$Scimago_Subject_Area, BPdata$Flexible_eligibility, useNA = "always") #as a table

BPdata %>% 
    mutate(Flexible_eligibility = factor(Flexible_eligibility, levels = c("no", "yes", "not applicable", "not available"))) %>%
    count(Scimago_Subject_Area, Flexible_eligibility) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Flexible_eligibility)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#C1FFC1", "#CCCCCC", "#E5E5E5")) +
    #scale_fill_manual(values = c("#FA8072", "#FA807210", "#F1FFC1", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Eligibility career timeline is flexible:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Inclusivity_statement by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Inclusivity_statement) #as a table

BPdata %>% 
    mutate(Inclusivity_statement = factor(Inclusivity_statement, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Inclusivity_statement) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Inclusivity_statement)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Inclusivity statement or encouragement is provided:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Inclusivity_statement by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Assessors_transparency) #as a table

BPdata %>% 
    mutate(Assessors_transparency = factor(Assessors_transparency, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Assessors_transparency) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Assessors_transparency)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Assessors are revealed (names or as journal editors):") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Inclusivity_statement by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Process_transparency) #as a table

BPdata %>% 
    mutate(Process_transparency = factor(Process_transparency, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Process_transparency) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Process_transparency)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Process is transparent:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Self_nomination by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Self_nomination) #as a table

BPdata %>% 
    mutate(Self_nomination = factor(Self_nomination, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Self_nomination) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Self_nomination)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Self-nomination is allowed:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Letter_required by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Letter_required) #as a table

BPdata %>% 
    mutate(Letter_required = factor(Letter_required, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Letter_required) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Letter_required)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Nomination letter is required:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Feedback_availability by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Feedback_availability) #as a table

BPdata %>% 
    mutate(Feedback_availability = factor(Feedback_availability, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Feedback_availability) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Feedback_availability)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Feedback is availabile:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Award_contact_provided by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Award_contact_provided) #as a table

BPdata %>% 
    mutate(Award_contact_provided = factor(Award_contact_provided, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Award_contact_provided) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_contact_provided)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Award contact point is provided:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Criteria_transparency by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Criteria_transparency) #as a table

BPdata %>% 
    mutate(Criteria_transparency = factor(Criteria_transparency, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Criteria_transparency) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Criteria_transparency)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Assessment criteria are detailed out:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Award_impact_metrics_mentioned by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Award_impact_metrics_mentioned) #as a table

BPdata %>% 
    mutate(Award_impact_metrics_mentioned = factor(Award_impact_metrics_mentioned, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Award_impact_metrics_mentioned) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_impact_metrics_mentioned)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Impact metrics are mentioned in criteria:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Award_impact_metrics_only by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Award_impact_metrics_only) #as a table

BPdata %>% 
    mutate(Award_impact_metrics_only = factor(Award_impact_metrics_only, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Award_impact_metrics_only) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_impact_metrics_only)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Impact metrics are only criteria:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Figure SX - Open_science by SA

#table(BPdata$Scimago_Subject_Area, BPdata$Open_science) #as a table

BPdata %>% 
    mutate(Open_science = factor(Open_science, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    count(Scimago_Subject_Area, Open_science) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Open_science)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Open Science practices are valued:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

2.8 Text mining of award descriptions

Award descriptions - wordcloud of frequencies all words.

#Using packages tidytext and stopwords
Award_description_txt <- tibble(txt = tolower(BPdata$Award_description))
Award_description_txt <- Award_description_txt %>% unnest_tokens(output = word, input = txt, token = "words", to_lower = TRUE) #restructure all descriptions as one-token-per-row format
Award_description_txt <- Award_description_txt %>% anti_join(get_stopwords()) #remove stop words from library(stopwords) 
## Joining with `by = join_by(word)`
Award_description_txt <- Award_description_txt %>% anti_join(get_stopwords()) #remove stop words from library(stopwords) 
## Joining with `by = join_by(word)`
Award_description_txt$word <- tokenize_word_stems(Award_description_txt$word) #make all word stems lowercase
word.freq <- Award_description_txt %>% count(word, sort = TRUE) #count words

## plot wordcloud using library(wordcloud) for top 100 words
#word.freq %>% with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0.35, colors = brewer.pal(12, "Paired")))

Award descriptions - count mentions of specific words (stemmed):

#create list of specific words (stemmed) to count within strings
specific.words <- c("best", "outstand", "impact", "original", "significant", "merit", "excell", "innovat", "novel", "creativ", "transparen", "reliab", "robust", "reproduc", "replica", "rigor", "rigour", "trust")

#prepare award descriptions as a single lowercase string
descriptions <- BPdata %>% 
  filter(!is.na(Award_description)) %>% 
  select(Award_description) %>% 
  tolower() #single lowercase string

#sum of all mentions for each word
specific.words.mentions <- specific.words %>%
  map_int(~ str_count(tolower(descriptions), .x))

#prepare award descriptions while keeping them separate for each award
descriptions2 <- tolower(BPdata$Award_description) #vector of lowercase strings

#sum of mentions per award for each word (counts only one mention per award)
specific.words.mentions2 <- specific.words %>%
  map_int(~ sum(str_detect(descriptions2, .x), na.rm = TRUE))

# ## doing the same as above, but manually:
# #count all mentions of words(parts) individually, e.g.:
# sum(str_count(tolower(BPdata$Award_description), "best"), na.rm = TRUE) #169
# #counting once per award, e.g. 
# sum(str_detect(BPdata$Award_description, "best"), na.rm = TRUE) #82

2.9 Figure 4

Plot frequencies of specific words - all mentions in award descriptions.

words.df <- tibble(Words = specific.words, 
                            Count_all = specific.words.mentions,
                            Count_once = specific.words.mentions2)

figure4A <- words.df %>%
    ggplot(aes(x = reorder(Words, Count_all), y = Count_all)) + 
    geom_col(width = 0.8, fill = "#838B83") +
    coord_flip() +
    scale_y_continuous(breaks = c(0, 50, 100, 150), limits = c(0, 200)) +
    theme_bw() + 
    labs(x = "Word stem", y = "Count of all mentions") + 
    annotate(geom = "rect",xmin = 8.5, xmax = Inf, ymin = -Inf, ymax = Inf, fill = "#A52A2A", alpha = 0.1) +
    annotate(geom = "rect",xmin = -Inf, xmax = 8.5, ymin = -Inf, ymax = Inf, fill = "#00FF00", alpha = 0.1) +
    theme(legend.position = "none", axis.title.x = element_text(size = 10))

Plot frequencies of specific words - first mentions in award descriptions.

figure4B <- words.df %>%
    ggplot(aes(x = reorder(Words, Count_all), y = Count_once)) + 
    geom_col(width = 0.8, fill = "#C1CDC1") +
    coord_flip() +
    scale_y_continuous(breaks = c(0, 50, 100, 150), limits = c(0, 200)) +
    theme_bw() + 
    labs(x = "Word stem", y = "Count of first mention per award") + 
    scale_x_discrete(labels = NULL) + labs(x = "")  + #used to remove vertical labels, also breaks = NULL
    annotate(geom = "rect",xmin = 8.5, xmax = Inf, ymin = -Inf, ymax = Inf, fill = "#A52A2A", alpha = 0.1) +
    annotate(geom = "rect",xmin = -Inf, xmax = 8.5, ymin = -Inf, ymax = Inf, fill = "#00FF00", alpha = 0.1) +
    theme(legend.position = "none", axis.title.x = element_text(size = 10))

Combine 2 panels and save:

#assemble the panels using patchwork package
figure4 <- figure4A / figure4B + 
  plot_layout(ncol = 2, nrow = 1) +
  plot_annotation(tag_levels = "A")
#ggsave(plot = figure4, here("plots", "Fig4AB_words_counts_v0.png"), width = 18, height = 8, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure4, here("plots", "Fig4AB_words_counts_v0.pdf"), width = 18, height = 8, units = "cm", scale = 1.2)

2.10 Characteristics of the past winners

How many awards extracted overall?

length(unique(BPdata$Award_name[BPdata$N_winners_extracted != 0])) #62 - number from the records in the award-level data set
## [1] 61
length(unique(BPindiv$award_name)) #62 - number from records in past winner's data
## [1] 62

How many awards extracted per SA?

#from BPdata:
BPdata %>% 
  group_by(Scimago_Subject_Area) %>% 
  filter(N_winners_extracted != 0) %>% 
  count(Scimago_Subject_Area) -> awards_perSA

# #from BPindiv:
# BPindiv %>%  
#   group_by(award_SA) %>%
#   summarise(count = n_distinct(award_name))

#plot counts where >0 awards per SA
p_count.awards.SA <- BPindiv %>% 
  group_by(award_SA) %>%
  summarise(count = n_distinct(award_name)) %>% 
  ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = count)) +
  geom_bar(stat = "identity", position = position_dodge(0.9)) +
  coord_flip() +
  theme_bw() +
  labs(x = "Scimago Subject Area", y = "Award count")  

How many records (names) extracted overall?

#sum(BPdata$N_winners_extracted, na.rm = TRUE) #number from the records in the award-level data set
#dim(BPindiv)[1] #number of records in past winner's data

How many records (names) extracted per SA?

#from BPdata:
BPdata %>% 
  group_by(Scimago_Subject_Area) %>% 
  select(N_winners_extracted) %>% 
  filter(!is.na(N_winners_extracted)) %>% 
  summarise(sum_winners = sum(N_winners_extracted), .groups = 'drop') -> winners_perSA1
## Adding missing grouping variables: `Scimago_Subject_Area`
winners_perSA1[winners_perSA1$sum_winners == 0, ] #8 SA without any extracted names
## # A tibble: 8 × 2
##   Scimago_Subject_Area                sum_winners
##   <chr>                                     <dbl>
## 1 Business, Management and Accounting           0
## 2 Decision Sciences                             0
## 3 Economics, Econometrics and Finance           0
## 4 Engineering                                   0
## 5 Health Professions                            0
## 6 Materials Science                             0
## 7 Mathematics                                   0
## 8 Social Sciences                               0
dim(winners_perSA1[winners_perSA1$sum_winners != 0, ])[1] #number of SA with any extracted names
## [1] 19
winners_perSA1[winners_perSA1$sum_winners != 0, ] #table of SA with any extracted names
## # A tibble: 19 × 2
##    Scimago_Subject_Area                         sum_winners
##    <chr>                                              <dbl>
##  1 Agricultural and Biological Sciences                  69
##  2 Arts and Humanities                                   17
##  3 Biochemistry, Genetics and Molecular Biology         152
##  4 Chemical Engineering                                  33
##  5 Chemistry                                             95
##  6 Computer Science                                      45
##  7 Dentistry                                             14
##  8 Earth and Planetary Sciences                          15
##  9 Energy                                                21
## 10 Environmental Science                                 19
## 11 Immunology and Microbiology                           16
## 12 Medicine                                              37
## 13 Multidisciplinary                                     40
## 14 Neuroscience                                         134
## 15 Nursing                                               63
## 16 Pharmacology, Toxicology and Pharmaceutics            42
## 17 Physics and Astronomy                                122
## 18 Psychology                                            81
## 19 Veterinary                                            64
#from BPindiv:
#table(BPindiv$award_SA)
dim(table(BPindiv$award_SA, useNA = "always")) #19 SA with any extracted names
## [1] 20
BPindiv %>%
  group_by(award_SA) %>% count() -> winners_perSA1_extracted #does not show SA with 0 extracted

#plot counts where >0 winners per SA
p_count.awardees.SA <- BPindiv %>% 
  group_by(award_SA) %>% count() %>%
  ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = n)) +
  geom_bar(stat = "identity", position = position_dodge(0.9)) +
  coord_flip() +
  theme_bw() +
  #scale_x_discrete(labels = NULL) + labs(x = "")  + #used to remove vertical labels, also breaks = NULL
  labs(x = "Scimago Subject Area", y = "Awardee count")  

# View(left_join(x = winners_perSA1, y = winners_perSA1_extracted, by = c("Scimago_Subject_Area" = "award_SA"))) #check if matching numbers

Combine plots of total counts of awards and winners per SA

#assemble the panels using patchwork package
count.awards.awardees_SA <- p_count.awards.SA / p_count.awardees.SA + 
  plot_layout(ncol = 2, nrow = 1) +
  plot_annotation(tag_levels = "A")

#ggsave(here("plots", "count.awards.awardees_SA_v1.png"), width = 18, height = 8, units = "cm", dpi = "retina")
#ggsave(here("plots", "count.awards.awardees_SA_v1.pdf"), width = 18, height = 8, units = "cm")

How many records per decade?

table(BPindiv$award_year, useNA = "always")
## 
## 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 
##    9   11   11   13   13   20   16   20   21   31   41   45   56   58   74   78 
## 2017 2018 2019 2020 2021 2022 <NA> 
##   83   92   93  102  107   89    0
#create decade variable 2001-2010, 2011-2020, 2021-2022
BPindiv %>%  mutate(Decade = case_when(
    award_year >= 2001 & award_year <= 2010 ~ "2001-2010",
    award_year >= 2011 & award_year <= 2020 ~ "2011-2020",
    award_year >= 2021 & award_year <= 2022 ~ "2021-2022",
    award_year == "Veterinary" ~ "Biology",
    TRUE ~ NA)
    ) -> BPindiv

table(BPindiv$Decade, useNA = "always")
## 
## 2001-2010 2011-2020 2021-2022      <NA> 
##       165       722       196         0

How many records per decade and SA?

table(BPindiv$award_SA, BPindiv$Decade)
##                                               
##                                                2001-2010 2011-2020 2021-2022
##   Agricultural and Biological Sciences                24        36         9
##   Arts and Humanities                                  1        14         2
##   Biochemistry, Genetics and Molecular Biology        37        84        31
##   Chemical Engineering                                 0        16        17
##   Chemistry                                            0        78        17
##   Computer Science                                     1        32        12
##   Dentistry                                            3        10         1
##   Earth and Planetary Sciences                         0        11         4
##   Energy                                               3        16         2
##   Environmental Science                                1        15         3
##   Immunology and Microbiology                          0        12         4
##   Medicine                                             4        21        12
##   Multidisciplinary                                   10        23         7
##   Neuroscience                                        32        83        23
##   Nursing                                             20        39         4
##   Pharmacology, Toxicology and Pharmaceutics          12        24         6
##   Physics and Astronomy                                0       102        20
##   Psychology                                           1        64        16
##   Veterinary                                          16        42         6

How many individuals shared awards?

table(BPindiv$shared, useNA = "always") #157 shared (14% of names), usually shared by 2 authors)
## 
##   no  yes <NA> 
##  926  157    0
table(BPindiv$Decade, BPindiv$shared) #more common recently?
##            
##              no yes
##   2001-2010 148  17
##   2011-2020 630  92
##   2021-2022 148  48
table(BPindiv$award_SA, BPindiv$shared) # sharing most frequent in Computer Science
##                                               
##                                                 no yes
##   Agricultural and Biological Sciences          65   4
##   Arts and Humanities                           17   0
##   Biochemistry, Genetics and Molecular Biology 126  26
##   Chemical Engineering                          28   5
##   Chemistry                                     84  11
##   Computer Science                              10  35
##   Dentistry                                     14   0
##   Earth and Planetary Sciences                  12   3
##   Energy                                         9  12
##   Environmental Science                         17   2
##   Immunology and Microbiology                   14   2
##   Medicine                                      35   2
##   Multidisciplinary                             32   8
##   Neuroscience                                 115  23
##   Nursing                                       61   2
##   Pharmacology, Toxicology and Pharmaceutics    26  16
##   Physics and Astronomy                        120   2
##   Psychology                                    79   2
##   Veterinary                                    62   2

How many awards had article information?

NOTE: during first round of data extraction, we located around 3/4 of winning articles - these were articles where title or DOI was available on award page or announcement. However, article information was sometimes only publicly listed for recently awarded articles. Additional Internet searches increased the number of identified articles to 795.

table(is.na(BPindiv$awarded_paper_ref)) #200 missing (out of 1083 = 18%)
## 
## FALSE  TRUE 
##   883   200
table(is.na(BPindiv$awarded_paper_doi)) #201 missing (out of 108s = 18%)
## 
## FALSE  TRUE 
##   882   201

How many have gender assigned and how?

table(BPindiv$awardee_gender, useNA = "always") #0 missing or unclear
## 
##    F    M <NA> 
##  426  657    0
round(table(BPindiv$awardee_gender, useNA = "always")/length(BPindiv$awardee_gender) * 100, 1) #as percentages
## 
##    F    M <NA> 
## 39.3 60.7  0.0
table(BPindiv$gender_source, useNA = "always") #0 missing - change to "not found"
## 
##     name    photo pronouns     <NA> 
##      653      262      168        0
round(table(BPindiv$gender_source, useNA = "always")/length(BPindiv$awardee_gender) * 100, 1) #as percentages
## 
##     name    photo pronouns     <NA> 
##     60.3     24.2     15.5      0.0

2.10.1 Awardee gender

Gender counts overall

BPindiv %>%  
   group_by(award_SA) %>%
   count(awardee_gender)
## # A tibble: 38 × 3
## # Groups:   award_SA [19]
##    award_SA                                     awardee_gender     n
##    <chr>                                        <chr>          <int>
##  1 Agricultural and Biological Sciences         F                 33
##  2 Agricultural and Biological Sciences         M                 36
##  3 Arts and Humanities                          F                  4
##  4 Arts and Humanities                          M                 13
##  5 Biochemistry, Genetics and Molecular Biology F                 62
##  6 Biochemistry, Genetics and Molecular Biology M                 90
##  7 Chemical Engineering                         F                 12
##  8 Chemical Engineering                         M                 21
##  9 Chemistry                                    F                 41
## 10 Chemistry                                    M                 54
## # ℹ 28 more rows
BPindiv %>% 
  group_by(Decade) %>%
  count(awardee_gender)
## # A tibble: 6 × 3
## # Groups:   Decade [3]
##   Decade    awardee_gender     n
##   <chr>     <chr>          <int>
## 1 2001-2010 F                 65
## 2 2001-2010 M                100
## 3 2011-2020 F                284
## 4 2011-2020 M                438
## 5 2021-2022 F                 77
## 6 2021-2022 M                119

Gender plot overall

## set the levels in order we want:
BPindiv$Gender <- factor(BPindiv$awardee_gender)
levels(BPindiv$Gender) <- c("female", "male")

## plot for a all SA and years - not stacked, with count values:
BPindiv %>%  
#  group_by(award_SA) %>%
  count(Gender) %>%
  ggplot(aes(x = Gender, y = n)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  theme_bw() +
  theme(legend.position = "none", legend.box = "horizontal", axis.text = element_text(size = 10)) +
  labs(fill = "Awardee gender:") +
  labs(x = "Awardees", y = "Awardee count")  

Gender plots by Decade

## plot for a all awards by Decade - not stacked, with count values (40-41% in each decade):
BPindiv %>% 
  group_by(Decade) %>%
  count(Gender) %>%
  ggplot(aes(x = Decade, y = n, fill = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = position_dodge(0.8), width = 0.8, col = "white") + # use , position = "fill" for proportion plot
#  coord_flip() + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  theme_bw() +
  theme(legend.position="top", legend.box = "horizontal", axis.text = element_text(size = 10)) +
  labs(fill = "Awardee gender:") +
  labs(x = "Decade", y = "Awardee count")  

## plot for a all awards by Decade - horizontal, not stacked, with count values:
figure5A <- BPindiv %>% 
  group_by(Decade) %>%
  count(Gender) %>%
  ggplot(aes(y = Decade, x = n, fill = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity") + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  theme_bw() +
  theme(legend.position="top", legend.text = element_text(size = 8), legend.title = element_text(size = 8),
        axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
  guides(fill = guide_legend(nrow = 1, byrow = TRUE, reverse = TRUE)) + 
  labs(fill = "Awardee gender:") +
  labs(y = "Decade", x = "Awardee counts")  

## plot for a all awards by Decade - horizontal, stacked, with proportion values:
figure5B <- BPindiv %>% 
  group_by(Decade) %>%
  count(Gender) %>%
  ggplot(aes(x = Decade, y = n, fill = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = "fill") + # use , position = "fill" for proportion plot
  coord_flip() + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  theme_bw() +
  theme(legend.position = "none", legend.box = "horizontal", axis.text = element_text(size = 10)) +
  labs(fill = "Awardee gender:") +
  labs(x = "Decade", y = "Awardee proportion")  

Gender plot by SA

## plot for a all awards by SA - not stacked, with count values:
BPindiv %>% 
  group_by(award_SA) %>%
  count(Gender) %>%
  ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = n, fill = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = "fill") + # use , position = "fill" for proportion plot
  coord_flip() + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  theme_bw() +
  theme(legend.position="top", legend.box = "horizontal", axis.text = element_text(size = 10)) +
  labs(fill = "Awardee gender:") +
  labs(x = "Scimago Subject Area", y = "Awardee proportion")  

Gender plot by SA and Decade

## plot for a all awards by SA and Decade - not stacked, with count values:
BPindiv %>% 
  group_by(award_SA, Decade) %>%
  count(Gender) %>%
  ggplot(aes(x = reorder(award_SA, desc(award_SA)), y = n, fill = Gender)) +
  geom_bar(aes(fill = Gender), stat = "identity", position = position_dodge(0.8), width = 0.8, col = "white") + # use , position = "fill" for proportion plot
  facet_wrap(~Decade, scales = "fixed", nrow = 1, ncol = 3) +
  coord_flip() + 
  scale_fill_manual(values = c("#E69F00", "#56B4E9")) +
  theme_bw() +
  theme(legend.position="top", legend.box = "horizontal", axis.text = element_text(size = 10)) +
  labs(fill = "Awardee gender:") +
  labs(x = "Scimago Subject Area", y = "Awardee count")  

2.10.2 Affiliation country

How many had affiliation info and from where (affiliation_info_source)?

length(unique(BPindiv$affiliation_country)) #44 unique country names
## [1] 44
table(is.na(BPindiv$affiliation_country)) #45 values missing (out of 1083 = 4%)
## 
## FALSE  TRUE 
##  1029    54
table(BPindiv$affiliation_info_source, useNA = "always") #affiliations sources
## 
##            article award announcement         award page              other 
##                487                159                355                 29 
##               <NA> 
##                 53
round(table(BPindiv$affiliation_info_source, useNA = "always")/1058*100,1) #as percentages
## 
##            article award announcement         award page              other 
##               46.0               15.0               33.6                2.7 
##               <NA> 
##                5.0
#count number of unique countries per decade:
BPindiv %>% 
  filter(!is.na(affiliation_country)) %>% 
  group_by(Decade) %>% 
  summarise(count = n_distinct(affiliation_country))
## # A tibble: 3 × 2
##   Decade    count
##   <chr>     <int>
## 1 2001-2010    26
## 2 2011-2020    39
## 3 2021-2022    29
#count number of unique countries per year:
BPindiv %>% 
  filter(!is.na(affiliation_country)) %>% 
  group_by(award_year) %>% 
  summarise(count = n_distinct(affiliation_country))
## # A tibble: 22 × 2
##    award_year count
##         <dbl> <int>
##  1       2001     3
##  2       2002     5
##  3       2003     5
##  4       2004     6
##  5       2005     4
##  6       2006     7
##  7       2007     6
##  8       2008     8
##  9       2009     7
## 10       2010    12
## # ℹ 12 more rows
#count countries:
BPindiv %>% 
  filter(!is.na(affiliation_country)) %>% 
  count(affiliation_country) %>% 
  arrange(desc(n))
## # A tibble: 43 × 2
##    affiliation_country     n
##    <chr>               <int>
##  1 USA                   491
##  2 UK                     70
##  3 China                  60
##  4 Germany                51
##  5 Canada                 40
##  6 Australia              34
##  7 Japan                  23
##  8 Netherlands            22
##  9 New Zealand            22
## 10 France                 20
## # ℹ 33 more rows
# #count countries - view top 20:
# BPindiv %>% 
#   filter(!is.na(affiliation_country)) %>% 
#   count(affiliation_country) %>% 
#   arrange(desc(n)) %>%
#   filter(n>5) %>%
#   View()

#recode top 10 countries + other
BPindiv$Country10 <- recode(BPindiv$affiliation_country, 
                            "USA" = "USA", 
                            "UK" = "UK", 
                            "Australia" = "Australia", 
                            "China" = "China", 
                            "Germany" = "Germany", 
                            "Canada" = "Canada", 
                            "Japan" = "Japan", 
                            "Netherlands" = "Netherlands", 
                            "Italy" = "Italy", 
                            "France" = "France", 
                            .default = "other")
#table(BPindiv$Country10)

#recode countries with n>5 (20)
#BPindiv$Country20 <- recode(BPindiv$affiliation_country, 
                            # "USA" = "USA", 
                            # "UK" = "UK", 
                            # "Australia" = "Australia", 
                            # "China" = "China", 
                            # "Germany" = "Germany", 
                            # "Canada" = "Canada", 
                            # "Japan" = "Japan", 
                            # "Netherlands" = "Netherlands", 
                            # "Italy" = "Italy", 
                            # "France" = "France", 
                            # "Austria" = "Austria", 
                            # "Spain" = "Spain", 
                            # "Sweden" = "Sweden", 
                            # "Switzerland" = "Switzerland", 
                            # "India" = "India", 
                            # "South Korea" = "South Korea", 
                            # "Brazil" = "Brazil", 
                            # "Singapore" = "Singapore", 
                            # "Taiwan" = "Taiwan",  
                            # "Belgium" = "Belgium",
                            # .default = "other")
#table(BPindiv$Country20)

Country affiliation plots overall

#all countries as a simple barplot
BPindiv %>% 
  filter(!is.na(affiliation_country)) %>% 
  count(affiliation_country) %>%
  arrange((n)) %>%
  ggplot(aes(x = reorder(affiliation_country, n), y = n)) +
  geom_bar(stat = "identity", position = position_dodge(0.9)) +
  coord_flip()

#top 10 countries - two-colour barplot
BPindiv %>% 
  filter(!is.na(Country10)) %>% 
  count(Country10) %>%
  arrange((n)) %>%
  ggplot(aes(x = reorder(Country10, n), y = n)) +
  geom_bar(aes(fill= Country10 == "other"), stat = "identity", position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#DCDCDCFF", "#000000")) +
  coord_flip() + 
  theme_bw() +
  theme(legend.position="none", axis.text = element_text(size = 10)) +
  labs(x = "Affiliation country", y = "Awardee count")  

Country affiliation plot by SA

#top 10 countries by SA
BPindiv %>% 
  filter(!is.na(Country10)) %>% 
  count(award_SA, Country10) %>% 
  ggplot(aes(y = reorder(award_SA, desc(award_SA)), x = n, fill = Country)) +
  #geom_bar(stat = "identity")
  geom_bar(aes(fill = Country10), stat = "identity") +
  scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
  theme_bw() + 
  theme(legend.position="top", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
  labs(fill = "Awardee affiliation country:") +
  labs(x = "Scimago Subject Area", y = "Awardee count")  

#ggsave(here("plots", "awardee_country_SA_v1.png"), width = 18, height = 8, units = "cm", dpi = "retina")
#ggsave(here("plots", "awardee_country_SA_v1.pdf"), width = 18, height = 8, units = "cm")

Country affiliation plot by decade

#top 10 countries by Decade - horizontal, stacked, with count values
figure5C <- BPindiv %>% 
  filter(!is.na(Country10)) %>% 
  count(Decade, Country10) %>% 
  ggplot(aes(y = Decade, x = n, fill = Country10)) +
  geom_bar(aes(fill = Country10), stat = "identity") + # use , position = "fill" for proportion plot
  scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
  theme_bw() +
  theme(legend.position="top", legend.text = element_text(size = 8), legend.title = element_text(size = 8),
        axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
  guides(fill = guide_legend(nrow = 2, byrow = TRUE, reverse = TRUE)) + 
  labs(fill = "Awardee affiliation country:") +
  labs(y = "Decade", x = "Awardee counts")  

#top 10 countries by Decade - horizontal, stacked, with proportion values
figure5D <- BPindiv %>% 
  filter(!is.na(Country10)) %>% 
  count(Decade, Country10) %>% 
  ggplot(aes(y = Decade, x = n, fill = Country10)) +
  geom_bar(aes(fill = Country10), stat = "identity", position = "fill") + # use , position = "fill" for proportion plot
  scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
  theme_bw() +
  theme(legend.position = "none", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
  labs(fill = "Awardee affiliation country:") +
  labs(y = "Decade", x = "Awardee proportion")  

Country affiliation plot by decade and by SA

#top 10 countries by SA and decade
BPindiv %>% 
  filter(!is.na(Country10)) %>% 
  count(award_SA, Decade, Country10) %>% 
  ggplot(aes(y = reorder(award_SA, desc(award_SA)), x = n, fill = Country10)) +
  #geom_bar(stat = "identity")
  geom_bar(aes(fill = Country10), stat = "identity") + # use , position = "fill" for proportion plot
  scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
    facet_wrap(~Decade, scales = "fixed", nrow = 1, ncol = 3) +
  theme_bw() + 
  theme(legend.position="bottom", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
  labs(fill = "Awardee affiliation country:") +
  labs(x = "Awardee count", y = "Scimago Subject Area")  

#ggsave(here("plots", "awardee_country_SA_Decade_v1.png"), width = 18, height = 8, units = "cm", dpi = "retina")
#ggsave(here("plots", "awardee_country_SA_Decade_v1.pdf"), width = 18, height = 8, units = "cm")

Combine 4 panels and save:

#assemble the panels using patchwork package
figure5 <- figure5A / figure5B / figure5C / figure5D + 
  plot_layout(ncol = 1, nrow = 4) +
  plot_annotation(tag_levels = "A")
#ggsave(plot = figure5, here("plots", "Fig5ABCD_gender_countries_v0.png"), width = 18, height = 14, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure5, here("plots", "Fig5ABCD_gender_countries_v0.pdf"), width = 18, height = 14, units = "cm", scale = 1.2)

2.10.3 Country productivity context

COprod <- read_csv(here("data", "scimagojr country rank 2021.csv"), skip = 0) #load data
## Rows: 235 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Country, Region
## dbl (7): Rank, Documents, Citable documents, Citations, Self-citations, Cita...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#top10 contries
COprod$Country %in% unique(BPindiv$Country10) %>% sum() #check overlap - 8
## [1] 8
setdiff(unique(BPindiv$Country10), COprod$Country) #missing: "United States" and "United Kingdom"
## [1] "other" "USA"   "UK"    NA
COprod$Country <-  gsub("United States", "USA", COprod$Country)  #replace with matching name
COprod$Country <-  gsub("United Kingdom", "UK", COprod$Country)  #replace with a matching name
#all countries
COprod$Country %in% unique(BPindiv$affiliation_country) %>% sum() #check overlap - 40 
## [1] 40
setdiff(unique(BPindiv$affiliation_country), COprod$Country)  #missing:  "Vietnam"  "UAE"     "Russia" 
## [1] "Vietnam" NA        "UAE"     "Russia"
COprod$Country <-  gsub("Viet Nam", "Vietnam", COprod$Country)  #replace with matching name
COprod$Country <-  gsub("United Arab Emirates", "UAE", COprod$Country)  #replace with matching name
COprod$Country <-  gsub("Russian Federation", "Russia", COprod$Country)  #replace with a matching name

## Plot for all countries
#recode top 10 countries + other
COprod$Country_affil <- recode(COprod$Country, 
                            "USA" = "USA", 
                            "UK" = "UK", 
                            "Australia" = "Australia", 
                            "China" = "China", 
                            "Germany" = "Germany", 
                            "Canada" = "Canada", 
                            "Japan" = "Japan", 
                            "Netherlands" = "Netherlands", 
                            "Italy" = "Italy", 
                            "France" = "France", 
                            .default = "other")
#table(COprod_affil$Country_affil)
names(COprod)
##  [1] "Rank"                   "Country"                "Region"                
##  [4] "Documents"              "Citable documents"      "Citations"             
##  [7] "Self-citations"         "Citations per document" "H index"               
## [10] "Country_affil"
## Plot productivity of top 10 countries and other - color productivity (Documents) likefor Countries that are in BPindiv data
COprod %>% 
  ggplot(aes(x = 1, y = Documents, fill = Country_affil)) + #reorder(Country_affil, desc(Documents))
  geom_bar(stat = "identity", position = "fill") +
  coord_flip() + 
  #geom_bar(aes(fill = Country_affil), stat = "identity", position = "fill") +
  scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
  theme_bw() + 
  theme(legend.position="top", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
  labs(x = "Countries", y = "Documents", fill = "Awardee affiliation country:") + 
  scale_x_discrete(labels = NULL)  #used to remove vertical labels, also breaks = NULL

# ## Plot just for countries matchng BPindiv$affiliation_country
# #reduce productivity dataframe to match affiliation_country:
# COprod %>% filter(Country %in% unique(BPindiv$affiliation_country)) -> COprod_affil
# 
# #recode top 10 countries + other
# COprod_affil$Country_affil <- recode(COprod_affil$Country, 
#                             "USA" = "USA", 
#                             "UK" = "UK", 
#                             "Australia" = "Australia", 
#                             "China" = "China", 
#                             "Germany" = "Germany", 
#                             "Canada" = "Canada", 
#                             "Japan" = "Japan", 
#                             "Netherlands" = "Netherlands", 
#                             "Italy" = "Italy", 
#                             "France" = "France", 
#                             .default = "other")
# #table(COprod_affil$Country_affil)
# names(COprod_affil)
# 
# #productivity of top 10 countries and other
# COprod_affil %>% 
#   ggplot(aes(x = 1, y = Documents, fill = Country_affil)) + #reorder(Country_affil, desc(Documents))
#   geom_bar(stat = "identity", position = "fill") +
#   coord_flip() + 
#   #geom_bar(aes(fill = Country_affil), stat = "identity", position = "fill") +
#   scale_fill_manual(values = rev(c("#F0E442", "#56B4E9", "#000000", "#009E73", "#9D7660FF", "#0072B2", "#D55E00", "#FCC66DFF", "#FFBED1FF", "#C3CE3DFF", "#C8133BFF"))) +
#   theme_bw() + 
#   theme(legend.position="top", axis.text = element_text(size = 10), legend.box = "horizontal", legend.margin = margin()) +
#   labs(x = "Countries", y = "Documents", fill = "Awardee affiliation country:") + 
#   scale_x_discrete(labels = NULL)  #used to remove vertical labels, also breaks = NULL

Figure SX.

2.10.4 First names

What are the most common first names of the past award winners?

BPindiv$first_names <- word(BPindiv$awardee_name, 1) #extract first word
#length(unique(BPindiv$first_names)) #815 unique first names, note some are initials

#count number of unique first names per decade:
BPindiv %>% 
  filter(!is.na(first_names)) %>% 
  #group_by(Decade) %>% 
  summarise(count = n_distinct(first_names))
## # A tibble: 1 × 1
##   count
##   <int>
## 1   815
# #count number of unique first names per year:
# BPindiv %>% 
#   filter(!is.na(first_names)) %>% 
#   group_by(award_year) %>% 
#   summarise(count = n_distinct(first_names)) %>% View()


#count first_names - view 16 names with >5 counts (16 names):
BPindiv %>% 
   #filter(!is.na(first_names)) %>% 
   count(first_names) %>% 
   arrange(desc(n)) %>%
   filter(n > 5) 
## # A tibble: 16 × 2
##    first_names     n
##    <chr>       <int>
##  1 Sarah          13
##  2 David          10
##  3 Christopher     8
##  4 Michael         8
##  5 Anne            7
##  6 Daniel          7
##  7 Jonathan        7
##  8 Richard         7
##  9 Adam            6
## 10 Andrew          6
## 11 Benjamin        6
## 12 Brian           6
## 13 Eric            6
## 14 James           6
## 15 Philipp         6
## 16 William         6
#plot first names that appear more than 5 times
BPindiv %>% 
  #filter(!is.na(first_names)) %>% 
  count(first_names) %>% 
  arrange(desc(n)) %>%
  filter(n > 5) %>%
  ggplot(aes(x = reorder(first_names, n), y = n)) +
  geom_bar(stat = "identity", position = position_dodge(0.9)) +
  coord_flip() +
  theme_bw() +
  labs(x = "Winners first name", y = "Count")   

2.10.5 Affiliation institutions

What are the most common affiliation institutions of the past award winners?

#length(unique(BPindiv$affiliation_institution)) #672 unique first names, note some require cleaning from extra information after ,
BPindiv$institution <- gsub(",.*", "", BPindiv$affiliation_institution) #remove everything after first comma
#length(unique(BPindiv$institution)) #581

# #count number of unique first names per decade:
# BPindiv %>% 
#   filter(!is.na(institution)) %>% 
#   group_by(Decade) %>% 
#   summarise(count = n_distinct(institution))

# #count number of unique first names per year:
# BPindiv %>% 
#   filter(!is.na(institution)) %>% 
#   group_by(award_year) %>% 
#   summarise(count = n_distinct(institution)) %>% View()

# #count institutions - view names with >5 counts (16 names):
# BPindiv %>% 
#    filter(!is.na(institution)) %>% 
#    count(institution) %>% 
#    arrange(desc(n)) %>%
#    filter(n > 5) 

#plot first names that appear more than 5 times
BPindiv %>% 
  filter(!is.na(institution)) %>% 
  count(institution) %>% 
  arrange(desc(n)) %>%
  filter(n > 5) %>%
  ggplot(aes(x = reorder(institution, n), y = n)) +
  geom_bar(stat = "identity", position = position_dodge(0.9)) +
  coord_flip() +
  theme_bw() +
  labs(x = "Winners affiliation institution", y = "Count")   

Figure SX.

2.11 Cash and perks for the winners

Award descriptions - cash award mentioned?

#table(BPdata$Award_cash) #103 no, 111 yes

figure6A <- BPdata %>% 
    mutate(Award_cash = factor(Award_cash, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    mutate(Award_cash = recode(Award_cash, 'no' = 'no', 'yes' = 'yes', 'not available' = 'no description')) %>% #change level names
    count(Award_cash) %>%
    ggplot(aes(x = 1, y = n, fill = Award_cash)) + 
    geom_col(width = 1, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks = c(0,50,100, 150, 200, 250)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Cash", y = "Award count", fill = "Cash award mentioned:") + 
    scale_x_discrete(labels = NULL) + #used to remove vertical labels, also breaks = NULL
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Award descriptions - cash award mentioned across SA:

#table(BPdata$Scimago_Subject_Area, BPdata$Award_cash) #as a table by SA

BPdata %>% 
    mutate(Award_cash = factor(Award_cash, levels = (c("no", "not available", "yes")))) %>% #reorder value levels
    mutate(Award_cash = recode(Award_cash, 'no' = 'no', 'yes' = 'yes', 'not available' = 'no description')) %>% #change level names
    count(Scimago_Subject_Area, Award_cash) %>%
    ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = n, fill = Award_cash)) + 
    geom_col(width = 0.8, position = position_stack(reverse = TRUE)) +
    coord_flip() +
    scale_y_continuous(breaks=c(0,5,10)) +
    theme_classic() + 
    scale_fill_manual(values = c("#FA8072", "#E5E5E5", "#C1FFC1")) +
    labs(x = "Scimago Subject Area", y = "Award count", fill = "Cash award mentioned:") + 
    theme(legend.position = "top", axis.title.x = element_text(size = 10))

Award descriptions - cash award amount?

#table(!is.na(BPdata$Award_cash_max_USD_pperson)) #97 values available
#table(!is.na(BPdata$Award_cash_max_USD_pperson), BPdata$Award_individual) #of 97 values available, 40 are for individual awards

#plot overall - histogram
figure6B <- BPdata %>% 
  filter(!is.na(Award_cash_max_USD_pperson)) %>%
  ggplot(aes(y = Award_cash_max_USD_pperson)) + 
  geom_histogram(binwidth = 1000, fill = "grey60", col = "white") +
  coord_flip() +
  theme_bw() + 
  labs(y = "Award cash max. USD/person", x = "Award count") + 
  scale_y_continuous(breaks = c(seq(0, 30000, by = 5000))) +
  theme(legend.position = "none", axis.title.x = element_text(size = 10))


#Plot by SA 
BPdata %>% 
  filter(!is.na(Award_cash_max_USD_pperson)) %>%
  ggplot(aes(x = reorder(Scimago_Subject_Area, desc(Scimago_Subject_Area)), y = Award_cash_max_USD_pperson)) + 
  geom_beeswarm(color = "grey30", alpha = 0.5) +
  coord_flip() +
  theme_bw() + 
  labs(y = "Award cash max. USD/person", x = "Scimago Subject Area") + 
  theme(legend.position = "none", axis.title.x = element_text(size = 10))

2.12 Text mining of Comment_on_the_award_cash

Comment_on_the_award_cash - wordcloud of frequencies all words.

#Using packages tidytext and stopwords
Comment_on_the_award_cash_txt <- tibble(txt = tolower(BPdata$Comment_on_the_award_cash))
Comment_on_the_award_cash_txt <- Comment_on_the_award_cash_txt %>% unnest_tokens(output = word, input = txt, token = "words", to_lower = TRUE) #restructure all descriptions as one-token-per-row format
Comment_on_the_award_cash_txt <- Comment_on_the_award_cash_txt %>% anti_join(get_stopwords()) #remove stop words from library(stopwords) 
## Joining with `by = join_by(word)`
Comment_on_the_award_cash_txt$word <- tokenize_word_stems(Comment_on_the_award_cash_txt$word) #make all word stems lowercase
word.freq <- Comment_on_the_award_cash_txt %>% count(word, sort = TRUE) #count words
word.freq$word <-  gsub("[[:digit:]]", "", word.freq$word)  #remove numbers
word.freq$word <- gsub("[[:punct:][:blank:]]+", " ", word.freq$word) #remove punctuation
word.freq <-  word.freq %>% drop_na() %>% filter(word != "")

## plot wordcloud using library(ggwordcloud) for top 100 words
set.seed(111)
figure6C <- word.freq %>%
  top_n(50) %>%
  ggplot() + 
  geom_text_wordcloud_area(aes(label = word, size = n), eccentricity = 0.2, shape = "square") +
  #scale_size_area(max_size = 10) +
  theme_minimal() +
  scale_color_gradient(low = "#CCCCCC", high = "#1A1A1A") +
  theme(plot.background = element_rect(fill = "#FFFFFF", colour = "#FFFFFF")) #Ppreventing transparent background in png
## Selecting by n
# figure6C <- word.freq %>% with(wordcloud(word, n, max.words = 100, random.order = FALSE, rot.per = 0, colors = brewer.pal(12, "Paired")))

2.13 Figure 6

Combine 3 panels and save:

#assemble the panels using patchwork package
figure6 <- figure6A / (figure6B + figure6C) +
  plot_layout(nrow = 2, heights = c(1, 4)) +
  plot_annotation(tag_levels = "A")
#ggsave(plot = figure6, here("plots", "Fig6ABC_perks_v0.png"), width = 18, height = 8, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure6, here("plots", "Fig6ABC_perks_v0.pdf"), width = 18, height = 8, units = "cm", scale = 1.2)

Count mentions of specific words (stemmed) in Comment_on_the_award_cash:

#create list of specific words (stemmed) to count within strings
specific.words2 <- c("certificate", "grant", "attend", "expens", "plaque", "talk", "invit", "registr", "meet", "conference", "member", "travel", "subscr", "board", "editor", "ticket", "article", "free", "ceremon", "congress")

#prepare award descriptions as a single lowercase string
descriptions2 <- BPdata %>% 
  filter(!is.na(Comment_on_the_award_cash)) %>% 
  select(Comment_on_the_award_cash) %>% 
  tolower() #single lowercase string

#sum of all mentions for each word
specific.words.mentions2 <- specific.words2 %>%
  map_int(~ str_count(tolower(descriptions2), .x))

#prepare award descriptions while keeping them separate for each award
descriptions3 <- tolower(BPdata$Comment_on_the_award_cash) #vector of lowercase strings

#sum of mentions per award for each word (counts only one mention per award)
specific.words.mentions3 <- specific.words2 %>%
  map_int(~ sum(str_detect(descriptions3, .x), na.rm = TRUE))

# ## doing the same as above, but manually:
# #count all mentions of words(parts) individually, e.g.:
# sum(str_count(tolower(BPdata$Comment_on_the_award_cash), "certificate"), na.rm = TRUE)
# #counting once per award, e.g. 
# sum(str_detect(BPdata$Comment_on_the_award_cash, "certificate"), na.rm = TRUE)

Plot frequencies of specific words - all mentions in cash descriptions.

words.df <- tibble(Words = specific.words2, 
                            Count_all = specific.words.mentions2,
                            Count_once = specific.words.mentions3)

words.df %>%
    ggplot(aes(x = reorder(Words, Count_all), y = Count_all)) + 
    geom_col(width = 0.8, fill = "#838B83") +
    coord_flip() +
    scale_y_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55), limits = c(0, 55)) +
    theme_bw() + 
    labs(x = "Word stem", y = "Count of all mentions") + 
    theme(legend.position = "none", axis.title.x = element_text(size = 10))

Plot frequencies of specific words - first mentions in cash descriptions.

words.df %>%
    ggplot(aes(x = reorder(Words, Count_all), y = Count_once)) + 
    geom_col(width = 0.8, fill = "#C1CDC1") +
    coord_flip() +
    scale_y_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55), limits = c(0, 55)) +
    theme_bw() + 
    labs(x = "Word stem", y = "Count of first mention per award") + 
    #scale_x_discrete(labels = NULL) + labs(x = "")  + #used to remove vertical labels, also breaks = NULL
    theme(legend.position = "none", axis.title.x = element_text(size = 10))

Individual winners data - presence of a personal profile?

#note; based on individual winners data

#table(BPindiv$awardee_profile_shown, useNA = "always")

#plot overall
figure7A <- BPindiv %>% 
  mutate(awardee_profile_shown = factor(awardee_profile_shown, levels = (c("yes", "no")))) %>% #reorder value levels
  group_by(award_year) %>%
  count(awardee_profile_shown) %>%
  ggplot(aes(x = award_year, y = n, fill = awardee_profile_shown)) + 
  geom_bar(aes(fill = awardee_profile_shown), stat = "identity") + # use , position = "fill" for proportion plot
  scale_fill_manual(values = c("#C1FFC1", "#FA8072")) +
  theme_bw() + 
  labs(x = "Year", y = "Winner count", fill = "Winner's profile shown") + 
  theme(legend.position = "top", axis.title.x = element_text(size = 10))

Individual winners data - presence of a personal photo?

#table(BPindiv$awardee_photo_shown, useNA = "always")

#plot overall
figure7B <- BPindiv %>% 
  mutate(awardee_photo_shown = factor(awardee_photo_shown, levels = (c("yes", "no")))) %>% #reorder value levels
  group_by(award_year) %>%
  count(awardee_photo_shown) %>%
  ggplot(aes(x = award_year, y = n, fill = awardee_photo_shown)) + 
  geom_bar(aes(fill = awardee_photo_shown), stat = "identity") + # use , position = "fill" for proportion plot
  scale_fill_manual(values = c("#C1FFC1", "#FA8072")) +
  theme_bw() + 
  labs(x = "Year", y = "Winner count", fill = "Winner's photo shown") + 
  theme(legend.position = "top", axis.title.x = element_text(size = 10))

Combine 2 panels and save:

#assemble the panels using patchwork package
figure7 <- figure7A / figure7B +
  plot_layout(ncol = 2, nrow = 1) +
  plot_annotation(tag_levels = "A")
#ggsave(plot = figure7, here("plots", "Fig7AB_perks_v0.png"), width = 18, height = 6, units = "cm", dpi = "retina", scale = 1.2)
#ggsave(plot = figure7, here("plots", "Fig7AB_perks_v0.pdf"), width = 18, height = 6, units = "cm", scale = 1.2)

2.13.1 Statistical models

2.14 Model individual winners gender across years

Statistical analyses of the gender difference in awardee gender across years - using individual-level data

#Scale the award_year variable to have the mean of 0 and SD of 1
BPindiv$Award_year_scaled <- scale(BPindiv$award_year)

#table(BPindiv$awardee_gender, useNA = "always")
BPindiv %>% 
  mutate(Gender = case_when(
    endsWith(awardee_gender, "F") ~ 1,
    endsWith(awardee_gender, "M") ~ 0
    )) -> BPindiv
#table(BPindiv$Gender)

#Fit generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):

#without year
model_gender <- glmer(Gender ~ 1+ (1|award_SA) + (1|award_name), family = "binomial", data = BPindiv) #without year as a predictor
summary(model_gender)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Gender ~ 1 + (1 | award_SA) + (1 | award_name)
##    Data: BPindiv
## 
##      AIC      BIC   logLik deviance df.resid 
##   1429.7   1444.7   -711.9   1423.7     1080 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4325 -0.7770 -0.6408  1.0866  2.0178 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  award_name (Intercept) 0.2365   0.4863  
##  award_SA   (Intercept) 0.1482   0.3849  
## Number of obs: 1083, groups:  award_name, 62; award_SA, 19
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.5005     0.1430  -3.501 0.000464 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plogis(summary(model_gender)$coef[1,1])*100 #calculate % difference between genders at the intercept: 38% female names]]
## [1] 37.74133
#with year
model_gender <- glmer(Gender ~ Award_year_scaled + (1|award_SA) + (1|award_name), family = "binomial", data = BPindiv) #with year as a predictor
summary(model_gender)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Gender ~ Award_year_scaled + (1 | award_SA) + (1 | award_name)
##    Data: BPindiv
## 
##      AIC      BIC   logLik deviance df.resid 
##   1430.7   1450.7   -711.4   1422.7     1079 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4649 -0.7945 -0.6312  1.0715  2.1235 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  award_name (Intercept) 0.2231   0.4724  
##  award_SA   (Intercept) 0.1660   0.4074  
## Number of obs: 1083, groups:  award_name, 62; award_SA, 19
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -0.51269    0.14555  -3.522 0.000428 ***
## Award_year_scaled  0.07079    0.07194   0.984 0.325116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_yr_scl -0.084
plogis(summary(model_gender)$coef[1,1])*100 #calculate % difference between genders at the intercept: 37% female names
## [1] 37.45634
#use the reports package (Makowski et al. 2021) to summarize the analysis
report::report(model_gender)
## We fitted a logistic mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict Gender with Award_year_scaled (formula: Gender ~ Award_year_scaled).
## The model included award_SA as random effects (formula: list(~1 | award_SA, ~1
## | award_name)). The model's total explanatory power is weak (conditional R2 =
## 0.11) and the part related to the fixed effects alone (marginal R2) is of
## 1.36e-03. The model's intercept, corresponding to Award_year_scaled = 0, is at
## -0.51 (95% CI [-0.80, -0.23], p < .001). Within this model:
## 
##   - The effect of Award year scaled is statistically non-significant and positive
## (beta = 0.07, 95% CI [-0.07, 0.21], p = 0.325; Std. beta = 0.07, 95% CI [-0.07,
## 0.21])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.

2.15 Model individual winners online profile across years

Statistical analyses of the gender difference in awardee gender across years - using individual-level data

#table(BPindiv$awardee_profile_shown, useNA = "always")
BPindiv %>% 
  mutate(Profile = case_when(
    endsWith(awardee_profile_shown, "yes") ~ 1,
    endsWith(awardee_profile_shown, "no") ~ 0
    )) -> BPindiv

#Fit generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package). NOte: does not work with SA as a random effect:
model_profile <- glmer(Profile ~ Award_year_scaled + (1|award_name), family = "binomial", data = BPindiv) #with year as a predictor
summary(model_profile) #slope 1.11 
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Profile ~ Award_year_scaled + (1 | award_name)
##    Data: BPindiv
## 
##      AIC      BIC   logLik deviance df.resid 
##    383.4    398.4   -188.7    377.4     1080 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.7706 -0.0154 -0.0096 -0.0051 10.6139 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  award_name (Intercept) 95.71    9.783   
## Number of obs: 1083, groups:  award_name, 62
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -9.1033     1.4117  -6.448 1.13e-10 ***
## Award_year_scaled   1.1135     0.2227   5.000 5.75e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_yr_scl -0.156
#use the reports package (Makowski et al. 2021) to summarize the analysis
report::report(model_profile)
## We fitted a logistic mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict Profile with Award_year_scaled (formula: Profile ~
## Award_year_scaled). The model included award_name as random effect (formula: ~1
## | award_name). The model's total explanatory power is substantial (conditional
## R2 = 0.97) and the part related to the fixed effects alone (marginal R2) is of
## 0.01. The model's intercept, corresponding to Award_year_scaled = 0, is at
## -9.10 (95% CI [-11.87, -6.34], p < .001). Within this model:
## 
##   - The effect of Award year scaled is statistically significant and positive
## (beta = 1.11, 95% CI [0.68, 1.55], p < .001; Std. beta = 1.11, 95% CI [0.68,
## 1.55])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.

2.16 Model individual winners online photo across years

Statistical analyses of the gender difference in awardee gender across years - using individual-level data

#table(BPindiv$awardee_photo_shown, useNA = "always")
BPindiv %>% 
  mutate(Photo = case_when(
    endsWith(awardee_photo_shown, "yes") ~ 1,
    endsWith(awardee_photo_shown, "no") ~ 0
    )) -> BPindiv

#Fit generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package). Note: does not work with SA as a random effect:
model_photo <- glmer(Photo ~ Award_year_scaled + (1|award_name), family = "binomial", data = BPindiv) #with year as a predictor
summary(model_photo) #slope 1.18
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Photo ~ Award_year_scaled + (1 | award_name)
##    Data: BPindiv
## 
##      AIC      BIC   logLik deviance df.resid 
##    381.3    396.3   -187.6    375.3     1080 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.5501 -0.0102 -0.0027  0.0353  2.1078 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  award_name (Intercept) 194.9    13.96   
## Number of obs: 1083, groups:  award_name, 62
## 
## Fixed effects:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -9.6806     1.3957  -6.936 4.03e-12 ***
## Award_year_scaled   1.1822     0.2378   4.972 6.63e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_yr_scl -0.173
#use the reports package (Makowski et al. 2021) to summarize the analysis
report::report(model_photo)
## We fitted a logistic mixed model (estimated using ML and Nelder-Mead optimizer)
## to predict Photo with Award_year_scaled (formula: Photo ~ Award_year_scaled).
## The model included award_name as random effect (formula: ~1 | award_name). The
## model's total explanatory power is substantial (conditional R2 = 0.98) and the
## part related to the fixed effects alone (marginal R2) is of 7.00e-03. The
## model's intercept, corresponding to Award_year_scaled = 0, is at -9.68 (95% CI
## [-12.42, -6.95], p < .001). Within this model:
## 
##   - The effect of Award year scaled is statistically significant and positive
## (beta = 1.18, 95% CI [0.72, 1.65], p < .001; Std. beta = 1.18, 95% CI [0.72,
## 1.65])
## 
## Standardized parameters were obtained by fitting the model on a standardized
## version of the dataset. 95% Confidence Intervals (CIs) and p-values were
## computed using a Wald z-distribution approximation.

2.17 Model award characteristics vs. how old are the awards?

NOTE: Using the year of oldest listed past winner as a proxy of award establishement year.

#Scale the Awardee_list_earliest_year variable to have the mean of 0 and SD of 1
BPdata$Awardee_list_earliest_year_scaled <- scale(BPdata$Awardee_list_earliest_year)

#table(!is.na(BPdata$Awardee_list_earliest_year)) # values available

## try logistic regressions with Awardee_list_earliest_year for the following variables:
#Award_individual, Flexible_eligibility, Inclusivity_statement, Assessors_transparency, Award_integrity_mentioned, Process_transparency, Self_nomination, Letter_required, Feedback_availability, Award_contact_provided, Criteria_transparency, Award_impact_metrics_mentioned, Award_impact_metrics_only, Open_science


## Award_individual
#table
table(BPdata$Award_individual, useNA = "always")
## 
##   no  yes <NA> 
##  156   66    0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  mutate(Award_individual_num = recode(Award_individual, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Award_individual_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Award_individual") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Award_individual <- glmer(Award_individual_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area) + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Award_individual) #slope ns
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Award_individual_num ~ Awardee_list_earliest_year_scaled + (1 |  
##     Scimago_Subject_Area) + (1 | Awarding_society)
##    Data: BPdata4
## 
##      AIC      BIC   logLik deviance df.resid 
##    229.9    243.1   -110.9    221.9      199 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.3911 -0.5661 -0.2933  0.6713  2.0516 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Awarding_society     (Intercept) 0.2334   0.4831  
##  Scimago_Subject_Area (Intercept) 2.3012   1.5170  
## Number of obs: 203, groups:  Awarding_society, 118; Scimago_Subject_Area, 27
## 
## Fixed effects:
##                                   Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                        -1.0512     0.3854  -2.727  0.00639 **
## Awardee_list_earliest_year_scaled   0.2523     0.2318   1.088  0.27650   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_lst___ 0.040
#report::report(model_Award_individual) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Flexible_eligibility
#table
table(BPdata$Flexible_eligibility, useNA = "always")
## 
##             no not applicable  not available            yes           <NA> 
##             21            185             11              5              0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Flexible_eligibility == "yes" | Flexible_eligibility == "no") %>%  
  mutate(Flexible_eligibility_num = recode(Flexible_eligibility, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Flexible_eligibility_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Flexible_eligibility") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
#model_Flexible_eligibility <- glmer(Flexible_eligibility_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area) + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor - NOT CONVERGING with random effects included
model_Flexible_eligibility <- glm(Flexible_eligibility_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor, without random effects included
summary(model_Flexible_eligibility) #slope ns
## 
## Call:
## glm(formula = Flexible_eligibility_num ~ Awardee_list_earliest_year_scaled, 
##     family = "binomial", data = BPdata4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7852  -0.6813  -0.6704  -0.6629   1.7989  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                       -1.32990    0.50324  -2.643  0.00823 **
## Awardee_list_earliest_year_scaled -0.08686    0.48362  -0.180  0.85747   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 24.564  on 23  degrees of freedom
## Residual deviance: 24.532  on 22  degrees of freedom
## AIC: 28.532
## 
## Number of Fisher Scoring iterations: 4
#report::report(model_Flexible_eligibility) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Assessors_transparency
#table
table(BPdata$Assessors_transparency, useNA = "always")
## 
##            no not available           yes          <NA> 
##            98            10           114             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Assessors_transparency == "yes" | Assessors_transparency == "no") %>%  
  mutate(Assessors_transparency_num = recode(Assessors_transparency, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Assessors_transparency_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Assessors_transparency") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Assessors_transparency <- glmer(Assessors_transparency_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area), family = "binomial", data = BPdata4) #with year as a predictor, + (1|Awarding_society) causes "boundary (singular)"
summary(model_Assessors_transparency) #slope ns
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Assessors_transparency_num ~ Awardee_list_earliest_year_scaled +  
##     (1 | Scimago_Subject_Area)
##    Data: BPdata4
## 
##      AIC      BIC   logLik deviance df.resid 
##    272.5    282.3   -133.3    266.5      191 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4059 -0.9351  0.7128  0.9220  1.2300 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Scimago_Subject_Area (Intercept) 0.2852   0.534   
## Number of obs: 194, groups:  Scimago_Subject_Area, 27
## 
## Fixed effects:
##                                   Estimate Std. Error z value Pr(>|z|)
## (Intercept)                        0.06852    0.18168   0.377    0.706
## Awardee_list_earliest_year_scaled  0.02803    0.15627   0.179    0.858
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_lst___ 0.012
#report::report(model_Assessors_transparency) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Award_integrity_mentioned - more likely in older awards!
#table
table(BPdata$Award_integrity_mentioned, useNA = "always")
## 
##            no not available           yes          <NA> 
##           192            11            19             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Award_integrity_mentioned == "yes" | Award_integrity_mentioned == "no") %>%  
  mutate(Award_integrity_mentioned_num = recode(Award_integrity_mentioned, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Award_integrity_mentioned_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Award_integrity_mentioned") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Award_integrity_mentioned <- glmer(Award_integrity_mentioned_num ~ Awardee_list_earliest_year_scaled +  (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor, + (1|Scimago_Subject_Area) causes "boundary (singular)"
summary(model_Award_integrity_mentioned) #slope ns, but close
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Award_integrity_mentioned_num ~ Awardee_list_earliest_year_scaled +  
##     (1 | Awarding_society)
##    Data: BPdata4
## 
##      AIC      BIC   logLik deviance df.resid 
##     85.6     95.4    -39.8     79.6      190 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.1129 -0.0979 -0.0037 -0.0023  4.6810 
## 
## Random effects:
##  Groups           Name        Variance Std.Dev.
##  Awarding_society (Intercept) 559.1    23.64   
## Number of obs: 193, groups:  Awarding_society, 115
## 
## Fixed effects:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -11.6213     2.6581  -4.372 1.23e-05 ***
## Awardee_list_earliest_year_scaled  -1.1068     0.7514  -1.473    0.141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_lst___ 0.657
#report::report(model_Award_integrity_mentioned) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Self_nomination - trend for newer awards
#table
table(BPdata$Self_nomination, useNA = "always")
## 
##            no not available           yes          <NA> 
##           183            11            28             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Self_nomination == "yes" | Self_nomination == "no") %>%  
  mutate(Self_nomination_num = recode(Self_nomination, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4  %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Self_nomination_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Self_nomination") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Self_nomination <- glmer(Self_nomination_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Self_nomination) #slope ns, but trend for newer awards to mention self-nominations
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Self_nomination_num ~ Awardee_list_earliest_year_scaled + (1 |  
##     Scimago_Subject_Area)
##    Data: BPdata4
## 
##      AIC      BIC   logLik deviance df.resid 
##    154.0    163.8    -74.0    148.0      190 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -0.6315 -0.4125 -0.3501 -0.2592  4.8781 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Scimago_Subject_Area (Intercept) 0.3627   0.6023  
## Number of obs: 193, groups:  Scimago_Subject_Area, 27
## 
## Fixed effects:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -2.0653     0.3095  -6.673 2.51e-11 ***
## Awardee_list_earliest_year_scaled   0.5658     0.3225   1.754   0.0793 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_lst___ -0.343
#report::report(model_Self_nomination) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Letter_required - less and less recently
#table
table(BPdata$Letter_required, useNA = "always")
## 
##            no not available           yes          <NA> 
##           173            11            38             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Letter_required == "yes" | Letter_required == "no") %>%  
  mutate(Letter_required_num = recode(Letter_required, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4  %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Letter_required_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Letter_required") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Letter_required <- glmer(Letter_required_num ~ Awardee_list_earliest_year_scaled + (1|Scimago_Subject_Area) + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Letter_required) #slope significant negative -0.5431 - less common over the years
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: Letter_required_num ~ Awardee_list_earliest_year_scaled + (1 |  
##     Scimago_Subject_Area) + (1 | Awarding_society)
##    Data: BPdata4
## 
##      AIC      BIC   logLik deviance df.resid 
##    176.8    189.9    -84.4    168.8      189 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.6478 -0.4311 -0.3374 -0.2687  2.6569 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Awarding_society     (Intercept) 0.1654   0.4067  
##  Scimago_Subject_Area (Intercept) 0.5556   0.7454  
## Number of obs: 193, groups:  Awarding_society, 115; Scimago_Subject_Area, 27
## 
## Fixed effects:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -1.7578     0.3110  -5.653 1.58e-08 ***
## Awardee_list_earliest_year_scaled  -0.5425     0.1929  -2.812  0.00492 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_lst___ 0.208
#report::report(model_Letter_required) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Award_contact_provided
#table
table(BPdata$Award_contact_provided, useNA = "always")
## 
##            no not available           yes          <NA> 
##           173            11            38             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Award_contact_provided == "yes" | Award_contact_provided == "no") %>%  
  mutate(Award_contact_provided_num = recode(Award_contact_provided, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4  %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Award_contact_provided_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Award_individual") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Award_contact_provided <- glm(Award_contact_provided_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor - but fails with random effects
summary(model_Award_contact_provided) #slope signif negative -0.3314 - less likely recently
## 
## Call:
## glm(formula = Award_contact_provided_num ~ Awardee_list_earliest_year_scaled, 
##     family = "binomial", data = BPdata4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0404  -0.6292  -0.5738  -0.5426   1.9943  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -1.5473     0.1923  -8.048 8.41e-16 ***
## Awardee_list_earliest_year_scaled  -0.3283     0.1625  -2.021   0.0433 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182.74  on 192  degrees of freedom
## Residual deviance: 178.87  on 191  degrees of freedom
## AIC: 182.87
## 
## Number of Fisher Scoring iterations: 4
#report::report(model_Award_contact_provided) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Criteria_transparency
#table
table(BPdata$Criteria_transparency, useNA = "always")
## 
##            no not available           yes          <NA> 
##           170            12            40             0
#filter and make binomial 
BPdata %>% 
filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Criteria_transparency == "yes" | Criteria_transparency == "no") %>%  
  mutate(Criteria_transparency_num = recode(Criteria_transparency, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Criteria_transparency_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Criteria_transparency") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Criteria_transparency <- glm(Criteria_transparency_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Criteria_transparency) #slope ns 
## 
## Call:
## glm(formula = Criteria_transparency_num ~ Awardee_list_earliest_year_scaled, 
##     family = "binomial", data = BPdata4)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7817  -0.7168  -0.6488  -0.4688   2.2484  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -1.4415     0.1891  -7.623 2.48e-14 ***
## Awardee_list_earliest_year_scaled   0.4048     0.2412   1.678   0.0934 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 191.04  on 191  degrees of freedom
## Residual deviance: 187.57  on 190  degrees of freedom
## AIC: 191.57
## 
## Number of Fisher Scoring iterations: 4
#report::report(model_Criteria_transparency) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Award_impact_metrics_mentioned - mostly last 10 years
#table
table(BPdata$Award_impact_metrics_mentioned, useNA = "always")
## 
##            no not available           yes          <NA> 
##           190            11            21             0
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Award_impact_metrics_mentioned == "yes" | Award_impact_metrics_mentioned == "no") %>%  
  mutate(Award_impact_metrics_mentioned_num = recode(Award_impact_metrics_mentioned, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Award_impact_metrics_mentioned_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Award_impact_metrics_mentioned") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Award_impact_metrics_mentioned <- glmer(Award_impact_metrics_mentioned_num ~ Awardee_list_earliest_year_scaled + (1|Awarding_society), family = "binomial", data = BPdata4) #with year as a predictor
summary(model_Award_impact_metrics_mentioned) #slope signif 4.930 - recently mentioned more often
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## Award_impact_metrics_mentioned_num ~ Awardee_list_earliest_year_scaled +  
##     (1 | Awarding_society)
##    Data: BPdata4
## 
##      AIC      BIC   logLik deviance df.resid 
##     90.1     99.9    -42.1     84.1      190 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -0.99218 -0.01514 -0.00158 -0.00009  2.46961 
## 
## Random effects:
##  Groups           Name        Variance Std.Dev.
##  Awarding_society (Intercept) 916.8    30.28   
## Number of obs: 193, groups:  Awarding_society, 115
## 
## Fixed effects:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -14.264      3.262  -4.373 1.23e-05 ***
## Awardee_list_earliest_year_scaled    4.870      2.256   2.158   0.0309 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Awrd_lst___ -0.570
#report::report(model_Award_impact_metrics_mentioned) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Award_impact_metrics_only - seem to be from the last 5 years!!
#table
table(BPdata$Award_impact_metrics_only, useNA = "always")
## 
##            no not available           yes          <NA> 
##           203            11             8             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Award_impact_metrics_only == "yes" | Award_impact_metrics_only == "no") %>%  
  mutate(Award_impact_metrics_only_num = recode(Award_impact_metrics_only, yes = 1, no = 0)) -> BPdata4
#plot
BPdata4 %>% 
  ggplot(aes(x = Awardee_list_earliest_year, y = Award_impact_metrics_only_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Award_impact_metrics_only") +
  theme_minimal() 
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Award_impact_metrics_only <- glm(Award_impact_metrics_only_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor, fails with random effects
summary(model_Award_impact_metrics_only) #slope signif 3.782 - more common recently
## 
## Call:
## glm(formula = Award_impact_metrics_only_num ~ Awardee_list_earliest_year_scaled, 
##     family = "binomial", data = BPdata4)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.69414  -0.36191  -0.16275  -0.03612   2.44021  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         -5.154      1.289  -3.998  6.4e-05 ***
## Awardee_list_earliest_year_scaled    3.784      1.705   2.219   0.0265 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 66.596  on 192  degrees of freedom
## Residual deviance: 54.872  on 191  degrees of freedom
## AIC: 58.872
## 
## Number of Fisher Scoring iterations: 8
#report::report(model_Award_impact_metrics_only) #use the reports package (Makowski et al. 2021) to summarize the analysis


## Open_science
#table
table(BPdata$Open_science, useNA = "always")
## 
##            no not available           yes          <NA> 
##           210            11             1             0
#filter and make binomial 
BPdata %>% 
  filter(!is.na(Awardee_list_earliest_year)) %>% 
  filter(Open_science == "yes" | Open_science == "no") %>%  
  mutate(Open_science_num = recode(Open_science, yes = 1, no = 0))  -> BPdata4
#plot
BPdata4 %>%
  ggplot(aes(x = Awardee_list_earliest_year, y = Open_science_num)) + 
  geom_point(alpha = 0.2) + 
  stat_smooth(method = "glm", method.args = list(family = binomial), se = TRUE) +
  xlab("Year of first listed awardee") + 
  ylab("Probability of Open_science") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

#model - fit a generalised mixed model with binomial error family and with logit link function, award as a random effect (using glmer from lme4 package):
model_Open_science <- glm(Open_science_num ~ Awardee_list_earliest_year_scaled, family = "binomial", data = BPdata4) #with year as a predictor, fails with random effects
summary(model_Open_science) #slope ns
## 
## Call:
## glm(formula = Open_science_num ~ Awardee_list_earliest_year_scaled, 
##     family = "binomial", data = BPdata4)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -0.53186  -0.08916  -0.07063  -0.05999   2.98784  
## 
## Coefficients:
##                                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -5.7776     1.3438  -4.299 1.71e-05 ***
## Awardee_list_earliest_year_scaled  -0.7611     0.5281  -1.441     0.15    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 12.520  on 192  degrees of freedom
## Residual deviance: 10.936  on 191  degrees of freedom
## AIC: 14.936
## 
## Number of Fisher Scoring iterations: 8
#report::report(model_Open_science) #use the reports package (Makowski et al. 2021) to summarize the analysis

Table of logistic regression results? ## try logistic regressions against Awardee_list_earliest_year for the following variables: #Award_individual, Flexible_eligibility, Inclusivity_statement, Assessors_transparency, Award_integrity_mentioned, Process_transparency, Self_nomination, Letter_required, Feedback_availability, Award_contact_provided, Criteria_transparency, Award_impact_metrics_mentioned, Award_impact_metrics_only, Open_science

report::report_effectsize(model_Award_impact_metrics_mentioned)
report::report_effectsize(model_Open_science)

r <- report::report(model_Open_science)
r <- report::report_table(model_Open_science)
r <- report::report_model(model_Open_science)
r
str(r)
summary(r)
names(as.data.frame(r))

str(model_Award_individual)
report::report(model_Award_individual)$Effects

TRY tables based on: https://cran.r-project.org/web/packages/sjPlot/vignettes/tab_mixed.html

library(sjPlot)
tab_model(model_Award_individual)
  Award_individual_num
Predictors Odds Ratios CI p
(Intercept) 0.35 0.16 – 0.74 0.006
Awardee list earliest
year scaled
1.29 0.82 – 2.03 0.276
Random Effects
σ2 3.29
τ00 Awarding_society 0.23
τ00 Scimago_Subject_Area 2.30
ICC 0.44
N Scimago_Subject_Area 27
N Awarding_society 115
Observations 203
Marginal R2 / Conditional R2 0.011 / 0.441